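# Financial News Analyzer: a Streamlit app that summarizes a pasted or
# uploaded news article, classifies its sentiment, and tags named entities.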
import streamlit as st
import pdfplumber, re
from transformers import pipeline, AutoTokenizer
# ───────────────── Cached pipelines ──────────────────────────
@st.cache_resource  # load the models once per session instead of on every rerun
def load_pipes():
    summarizer = pipeline("summarization", model=SUMM_MODEL)
    tokenizer = AutoTokenizer.from_pretrained(SUMM_MODEL)
    sentiment = pipeline("text-classification", model=SENT_MODEL)
    ner = pipeline("token-classification", model=NER_MODEL,
                   aggregation_strategy="simple")
    return summarizer, tokenizer, sentiment, ner
# ───────────────── Helper functions ──────────────────────────
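# Greedy token-budget chunker: accumulate whitespace-delimited pieces until
# adding the next one would exceed max_tokens, then yield the buffer.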
def split_by_tokens(text, max_tokens):
    words = re.split(r"(\s+)", text)
    buf, n = "", 0
    for w in words:
        # count tokens without the special tokens the tokenizer would add per call
        ln = len(TOK(w, add_special_tokens=False).input_ids)
        if n + ln <= max_tokens:
            buf, n = buf + w, n + ln
        else:
            yield buf.strip()
            buf, n = w, ln
    if buf.strip():
        yield buf.strip()
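# Two-pass ("map-reduce") summarization: summarize each chunk separately,
# then re-summarize the concatenation if it still exceeds the word target.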
def summarise(text):
    parts = list(split_by_tokens(text, MAX_TOK))
    per_len = max(25, min(80, TARGET_WORDS // max(1, len(parts))))
    first = [SUMMAR(p, max_length=per_len,
                    min_length=per_len // 2,
                    do_sample=False)[0]["summary_text"]
             for p in parts]
    joined = " ".join(first)
    if len(joined.split()) > TARGET_WORDS:
        joined = SUMMAR(joined, max_length=TARGET_WORDS,
                        min_length=TARGET_WORDS // 2,
                        do_sample=False)[0]["summary_text"]
    return joined
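# Trim a summary to its first n sentences (naive split on ". ").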
def shorten(summary, n):
    s = summary.split(". ")
    return (". ".join(s[:n]).rstrip(".") + ".") if len(s) > n else summary
def extract_pdf(file):
    txt = ""
    with pdfplumber.open(file) as pdf:
        for p in pdf.pages:
            txt += p.extract_text() or ""
    return txt
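# Bucket NER spans into display categories, deduplicating surface forms.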
def tag_entities(text):
    tt = {"Organization": [], "Person": [], "Location": [], "Miscellaneous": []}
    for e in NER(text):
        grp = {"ORG": "Organization", "PER": "Person",
               "LOC": "Location"}.get(e["entity_group"], "Miscellaneous")
        tt[grp].append(e["word"])
    return {k: sorted(set(v)) for k, v in tt.items() if v}
# ───────────────── Main Part ─────────────────────────────────
st.set_page_config(page_title="Financial News Analyzer",
                   page_icon="💰",
                   layout="wide")
st.title("💰 Financial News Analyzer")
st.markdown("##### Quickly grasp content, sentiment, and relevant entities from news")
# Models and other constants
SUMM_MODEL = "sshleifer/distilbart-cnn-12-6"
SENT_MODEL = "nynn/Fintuned_Sentiment"
NER_MODEL = "Babelscape/wikineural-multilingual-ner"
SUMMAR, TOK, SENT_CLF, NER = load_pipes()
MAX_TOK = 1024
TARGET_WORDS = 225
LABEL_MAP = {"LABEL_0": "Negative", "LABEL_1": "Positive", "LABEL_2": "Neutral"}
COLOR_MAP = {"Positive": "green", "Negative": "red", "Neutral": "gray"}
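# NOTE: the LABEL_0/1/2 names above assume this checkpoint's id2label order;
# if the model ships its own label names, prefer reading them from its config.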
# ───────────────── Sidebar input ─────────────────────────────
with st.sidebar:
    st.header("Input News to Analyze:")
    txt_input = st.text_area("Paste news article", height=300)
    pdf_file = st.file_uploader("Or upload PDF", type=["pdf"])
    sent_count = st.slider("Summary length (sentences)",
                           min_value=1, max_value=5, value=3, step=1)
    run_btn = st.button("🔍 Analyze", use_container_width=True)

raw_text = extract_pdf(pdf_file) if pdf_file else txt_input.strip()
# ───────────────── Main pipeline ─────────────────────────────
if run_btn:
    if not raw_text:
        st.warning("Please provide text or a PDF first.")
        st.stop()
    with st.spinner("Analyzing…"):
        full_sum = summarise(raw_text)
        summary = shorten(full_sum, sent_count)

    cols = st.columns([2, 1])
    with cols[0]:
        st.subheader("📝 Summary")
        st.write(summary)
    with cols[1]:
        res = SENT_CLF(summary)[0]
        label = LABEL_MAP.get(res["label"], res["label"])
        colour = COLOR_MAP.get(label, "gray")  # fall back if the label is unmapped
        st.subheader("📊 Sentiment")
        st.markdown(f"<h3 style='color:{colour};margin-bottom:0'>{label}</h3>"
                    f"<p>{res['score']*100:.1f}% Confidence</p>",
                    unsafe_allow_html=True)
    tags = tag_entities(summary)
    st.subheader("🏷️ Relevant Tags")
    if tags:
        # CSS for the badge pills
        pill_css = """
        <style>
        .tag-pill {
            display: inline-block;
            background: #f0f2f6;
            color: #333;
            padding: 4px 10px;
            margin: 2px 4px 2px 0;
            border-radius: 12px;
            font-size: 0.9em;
        }
        .tag-cat {
            font-weight: 600;
            margin-top: 0;
            margin-bottom: 4px;
        }
        </style>
        """
        st.markdown(pill_css, unsafe_allow_html=True)
        # Render each category as a header + pills
        for category, vals in tags.items():
            st.markdown(f"<div class='tag-cat'>{category}</div>", unsafe_allow_html=True)
            pills = "".join(f"<span class='tag-pill'>{v}</span>" for v in vals)
            st.markdown(pills, unsafe_allow_html=True)
    else:
        st.info("No entities detected.")
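# To run locally (assuming this script is saved as app.py): streamlit run app.py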