Update app.py
app.py
CHANGED
@@ -1,215 +1,141 @@
-import streamlit as st
-# … [old lines 1, 3–69 lost in extraction: the remaining imports (numpy,
-#    matplotlib.pyplot, re, base64, a PdfReader), the model constants
-#    SUMMARIZER_MODEL, TTS_MODELS, QA_MODEL, and the helpers hf_infer_json,
-#    split_into_chunks, embed_texts that the surviving code below calls]
-    return np.array(arr)
-
-def cosine_sim(a, b):
-    a = a / (np.linalg.norm(a, axis=-1, keepdims=True) + 1e-8)
-    b = b / (np.linalg.norm(b, axis=-1, keepdims=True) + 1e-8)
-    return a @ b.T
-
-def summarize_long_text(text: str):
-    chunks = split_into_chunks(text)
-    mini_summaries = []
-    for c in chunks:
-        out = hf_infer_json(SUMMARIZER_MODEL, {"inputs": c}, router=False)
-        if isinstance(out, list) and len(out) and "summary_text" in out[0]:
-            mini_summaries.append(out[0]["summary_text"])
-        else:
-            mini_summaries.append(c[:800])
-    return " ".join(mini_summaries), chunks
-
-def tts_wav_bytes(text: str) -> bytes:
-    for model in TTS_MODELS:
-        try:
-            res = hf_infer_json(model, {"inputs": text}, router=False, accept="audio/wav")
-            if isinstance(res, (bytes, bytearray)):
-                return res
-            if isinstance(res, dict) and "audio" in res:
-                return base64.b64decode(res["audio"])
-        except Exception:
-            continue
-    raise RuntimeError("All TTS models failed.")
-
-def extract_text_from_pdf(file) -> str:
-    reader = PdfReader(file)
-    pages = []
-    for p in reader.pages:
-        try:
-            pages.append(p.extract_text() or "")
-        except:
-            pages.append("")
-    return "\n".join(pages)
-
-def make_word_freq_chart(text: str, top_k=20):
-    text = text.lower()
-    stop = set(("the a an and of to in is are for with on by as at this that from be was were it its it’s into or if not your you we they their our can may such more most other also than which".split()))
-    tokens = re.findall(r"[a-zA-Z]{3,}", text)
-    freq = {}
-    for t in tokens:
-        if t in stop:
-            continue
-        freq[t] = freq.get(t, 0) + 1
-    items = sorted(freq.items(), key=lambda x: x[1], reverse=True)[:top_k]
-    if not items:
-        st.info("Not enough text to show a frequency chart.")
-        return
-    words, counts = zip(*items)
-    fig = plt.figure()
-    plt.bar(words, counts)
-    plt.xticks(rotation=60, ha="right")
-    plt.title("Top word frequencies")
-    plt.tight_layout()
-    st.pyplot(fig)
-
-# -----------------------------
-# UI
-# -----------------------------
-st.title("📄 PDF → Summary · 🔊 Audio · 📊 Chart · ❓ Q&A")
-st.caption("Free models via Hugging Face Hosted Inference API.")
-
-uploaded = st.file_uploader("Upload a PDF", type=["pdf"])
-
-if "doc_text" not in st.session_state:
-    st.session_state.doc_text = ""
-    st.session_state.chunks = []
-    st.session_state.chunk_vecs = None
-    st.session_state.summary = ""
-
-if uploaded:
-    with st.spinner("Extracting text..."):
-        text =
-# … [old lines 148–161 lost in extraction; only an "st." fragment survives]
-
-    with st.container():
-        if st.button("🔊 Generate Audio (summary)"):
-            target_text = st.session_state.summary or st.session_state.doc_text[:1200]
-            with st.spinner("Generating audio..."):
-                try:
-# … [old line 167 lost in extraction]
-                    st.audio(
-# … [old line 169 lost in extraction]
-                except Exception as e:
-                    st.error(f"TTS failed: {e}")
-
-# … [old lines 173–191 lost in extraction: the Q&A input and retrieval setup;
-#    the surviving code references `question` and `vecs` from this gap]
-        try:
-            q_vec = embed_texts([question])
-            sims = cosine_sim(q_vec, vecs).flatten()
-            top_idx = np.argsort(sims)[::-1][:3]
-            context = "\n".join([st.session_state.chunks[i] for i in top_idx])
-
-            qa_out = hf_infer_json(QA_MODEL, {"inputs": {"question": question, "context": context}}, router=False)
-            if isinstance(qa_out, dict):
-                ans = qa_out.get("answer", "")
-                score = qa_out.get("score", 0.0)
-            elif isinstance(qa_out, list) and len(qa_out) and isinstance(qa_out[0], dict):
-                ans = qa_out[0].get("answer", "")
-                score = qa_out[0].get("score", 0.0)
-            else:
-                ans, score = "", 0.0
-
-            st.write("**Answer:**", ans or "_(no confident answer)_")
-            st.caption(f"Confidence: {score:.3f}")
-            with st.expander("Context used"):
-                st.write(context)
-        except Exception as e:
-            st.error(f"
-# … [old lines 213–215 lost in extraction]
+# app.py
+import os
+import io
+import tempfile
+import streamlit as st
+from huggingface_hub import InferenceClient
+import pdfplumber
+from PIL import Image
+import base64
+
+# ---------- Configuration ----------
+HF_TOKEN = os.environ.get("HF_TOKEN")  # required
+GROQ_KEY = os.environ.get("GROQ_API_KEY")  # optional: if you want to call Groq directly
+USE_GROQ_PROVIDER = True  # set False to route to the default HF provider
+
+# model IDs (change if you prefer other models)
+LLAMA_MODEL = "Groq/Llama-3-Groq-8B-Tool-Use"  # Groq Llama model on HF
+TTS_MODEL = "espnet/kan-bayashi_ljspeech_vits"  # an HF-hosted TTS model example
+SDXL_MODEL = "stabilityai/stable-diffusion-xl-base-1.0"  # SDXL base model
+
+# create the Inference client (routed via the HF token by default)
+if USE_GROQ_PROVIDER:
+    client = InferenceClient(provider="groq", api_key=HF_TOKEN)
+else:
+    client = InferenceClient(api_key=HF_TOKEN)
+
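+# (Assumption, not verified by this commit: with provider="groq", recent
+#  huggingface_hub versions route the call through HF's inference-provider
+#  layer to Groq, authenticating with the HF token above; supplying a Groq
+#  key instead would authenticate directly against Groq.)
+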
+# ---------- Helpers ----------
+def pdf_to_text(uploaded_file) -> str:
+    text_chunks = []
+    with pdfplumber.open(uploaded_file) as pdf:
+        for page in pdf.pages:
+            ptext = page.extract_text()
+            if ptext:
+                text_chunks.append(ptext)
+    return "\n\n".join(text_chunks)
+
+def llama_summarize(text, max_tokens=512):
+    prompt = [
+        {"role": "system", "content": "You are a concise summarizer. Produce a clear summary in bullet points."},
+        {"role": "user", "content": f"Summarize the following document in <= 8 bullet points. Keep it short:\n\n{text}"}
+    ]
+    # Use the chat-completion endpoint; forward max_tokens so the parameter takes effect
+    resp = client.chat.completions.create(model=LLAMA_MODEL, messages=prompt, max_tokens=max_tokens)
+    try:
+        summary = resp.choices[0].message.content
+    except Exception:
+        # fallback: try the text-generation field
+        summary = resp.choices[0].text if hasattr(resp.choices[0], "text") else str(resp)
+    return summary
+
+def llama_chat(chat_history, user_question):
+    messages = chat_history + [{"role": "user", "content": user_question}]
+    resp = client.chat.completions.create(model=LLAMA_MODEL, messages=messages)
+    return resp.choices[0].message.content
+
+def tts_synthesize(text) -> bytes:
+    # InferenceClient.text_to_speech returns raw audio bytes (wav);
+    # the text is the first positional argument, the model a keyword.
+    audio_bytes = client.text_to_speech(text, model=TTS_MODEL)
+    return audio_bytes
+
+def generate_image(prompt_text) -> Image.Image:
+    # InferenceClient.text_to_image already returns a PIL Image,
+    # so no bytes round-trip is needed
+    return client.text_to_image(prompt_text, model=SDXL_MODEL)
+
+def audio_download_button(wav_bytes, filename="summary.wav"):
+    b64 = base64.b64encode(wav_bytes).decode()
+    href = f'<a href="data:audio/wav;base64,{b64}" download="{filename}">Download audio (WAV)</a>'
+    st.markdown(href, unsafe_allow_html=True)
+
+# ---------- Streamlit UI ----------
+st.set_page_config(page_title="PDFGPT (Groq + HF)", layout="wide")
+st.title("PDF → Summary + Speech + Chat + Diagram (Groq + HF)")
+
+uploaded = st.file_uploader("Upload PDF", type=["pdf"])
+if uploaded:
+    with st.spinner("Extracting text from PDF..."):
+        text = pdf_to_text(uploaded)
+    st.subheader("Extracted text (preview)")
+    st.text_area("Document text", value=text[:1000], height=200)
+
+    if st.button("Create summary (Groq Llama)"):
+        with st.spinner("Summarizing with Groq Llama..."):
+            summary = llama_summarize(text)
+        st.subheader("Summary")
+        st.write(summary)
+        st.session_state["summary"] = summary
+
+    if "summary" in st.session_state:
+        summary = st.session_state["summary"]
+        if st.button("Synthesize audio from summary (TTS)"):
+            with st.spinner("Creating audio..."):
+                try:
+                    audio = tts_synthesize(summary)
+                    st.audio(audio)
+                    audio_download_button(audio)
+                except Exception as e:
+                    st.error(f"TTS failed: {e}")
+
+    st.markdown("---")
+    st.subheader("Chat with your PDF (ask questions about document)")
+    if "chat_history" not in st.session_state:
+        # start with system + doc context (shortened)
+        doc_context = (text[:4000] + "...") if len(text) > 4000 else text
+        st.session_state["chat_history"] = [
+            {"role": "system", "content": "You are a helpful assistant that answers questions based on the provided document."},
+            {"role": "user", "content": f"Document context:\n{doc_context}"}
+        ]
+
+    user_q = st.text_input("Ask a question about the PDF")
+    if st.button("Ask") and user_q:
+        with st.spinner("Getting answer from Groq Llama..."):
+            answer = llama_chat(st.session_state["chat_history"], user_q)
+        st.session_state.setdefault("convo", []).append(("You", user_q))
+        st.session_state.setdefault("convo", []).append(("Assistant", answer))
+        # append to the history for subsequent calls
+        st.session_state["chat_history"].append({"role": "user", "content": user_q})
+        st.session_state["chat_history"].append({"role": "assistant", "content": answer})
+        st.write(answer)
+
+st.markdown("---")
+st.subheader("Generate a diagram from your question (SDXL)")
+diagram_prompt = st.text_input("Describe the diagram or scene to generate")
+if st.button("Generate diagram") and diagram_prompt:
+    with st.spinner("Generating image (SDXL)..."):
+        try:
+            img = generate_image(diagram_prompt)
+            st.image(img, use_column_width=True)
+            # allow download
+            buf = io.BytesIO()
+            img.save(buf, format="PNG")
+            st.download_button("Download diagram (PNG)", data=buf.getvalue(), file_name="diagram.png", mime="image/png")
+        except Exception as e:
+            st.error(f"Image generation failed: {e}")
+
+st.sidebar.title("Settings")
+st.sidebar.write("Models in use:")
+st.sidebar.write(f"LLM: {LLAMA_MODEL}")
+st.sidebar.write(f"TTS: {TTS_MODEL}")
+st.sidebar.write(f"Image: {SDXL_MODEL}")
+
+st.sidebar.markdown("**Notes**\n- Set HF_TOKEN in Space secrets or environment before starting.\n- To route directly to Groq with your Groq API key, set `GROQ_API_KEY` and change the client init accordingly.")
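
A minimal sketch of the direct-Groq init the second sidebar note alludes to. The commit itself never wires GROQ_KEY into the client, so the exact call below is an assumption based on huggingface_hub's provider routing, not part of this change:

import os
from huggingface_hub import InferenceClient

# Passing a Groq API key instead of an HF token makes provider routing
# authenticate straight against your Groq account.
client = InferenceClient(provider="groq", api_key=os.environ["GROQ_API_KEY"])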