DWD1211 committed on
Commit ad25769 · verified · 1 Parent(s): 9c2101c

Update app.py

Files changed (1)
  1. app.py +141 -14
app.py CHANGED
@@ -1,20 +1,147 @@
 import streamlit as st
-from transformers import pipeline

-def main():
-    sentiment_pipeline = pipeline(model="distilbert/distilbert-base-uncased-finetuned-sst-2-english")

-    st.title("Sentiment Analysis with HuggingFace Spaces")
-    st.write("Enter a sentence to analyze its sentiment:")

-    user_input = st.text_input("")
-    if user_input:
-        result = sentiment_pipeline(user_input)
-        sentiment = result[0]["label"]
-        confidence = result[0]["score"]

-        st.write(f"Sentiment: {sentiment}")
-        st.write(f"Confidence: {confidence:.2f}")

-if __name__ == "__main__":
-    main()

 import streamlit as st
+import pdfplumber, re
+from transformers import pipeline, AutoTokenizer
+
+# ───────────────── Cached pipelines ────────────────────────────────────
+@st.cache_resource(ttl=86400)
+def load_pipes():
+    summarizer = pipeline("summarization", model=SUMM_MODEL)
+    tokenizer = AutoTokenizer.from_pretrained(SUMM_MODEL)
+    sentiment = pipeline("text-classification", model=SENT_MODEL)
+    ner = pipeline("token-classification", model=NER_MODEL,
+                   aggregation_strategy="simple")
+    return summarizer, tokenizer, sentiment, ner
+
+# ───────────────── Helper functions ────────────────────────────────────
+def split_by_tokens(text, max_tokens):
+    words = re.split(r"(\s+)", text)
+    buf, n = "", 0
+    for w in words:
+        ln = len(TOK(w).input_ids)
+        if n + ln <= max_tokens:
+            buf, n = buf + w, n + ln
+        else:
+            yield buf.strip(); buf, n = w, ln
+    if buf.strip(): yield buf.strip()
+
+def summarise(text):
+    parts = list(split_by_tokens(text, MAX_TOK))
+    per_len = max(25, min(80, TARGET_WORDS // max(1, len(parts))))
+    first = [SUMMAR(p, max_length=per_len,
+                    min_length=per_len//2,
+                    do_sample=False)[0]["summary_text"]
+             for p in parts]
+    joined = " ".join(first)
+    if len(joined.split()) > TARGET_WORDS:
+        joined = SUMMAR(joined, max_length=TARGET_WORDS,
+                        min_length=TARGET_WORDS//2,
+                        do_sample=False)[0]["summary_text"]
+    return joined
+
+def shorten(summary, n):
+    s = summary.split(". ")
+    return (". ".join(s[:n]).rstrip(".") + ".") if len(s) > n else summary
+
+def extract_pdf(file):
+    txt = ""
+    with pdfplumber.open(file) as pdf:
+        for p in pdf.pages: txt += p.extract_text() or ""
+    return txt
+
+def tag_entities(text):
+    tt = {"Organization": [], "Person": [], "Location": [], "Miscellaneous": []}
+    for e in NER(text):
+        grp = {"ORG": "Organization", "PER": "Person",
+               "LOC": "Location"}.get(e["entity_group"], "Miscellaneous")
+        tt[grp].append(e["word"])
+    return {k: sorted(set(v)) for k, v in tt.items() if v}
+
+# ───────────────── Main Part ───────────────────────────────────────
+st.set_page_config(page_title="Financial News Analyzer",
+                   page_icon="📰",
+                   layout="wide")
+st.title("📰 Financial News Analyzer")
+st.markdown("##### Instantly grasp news content, sentiment, and relevant entities")
+
+# models and other constant variables
+SUMM_MODEL = "sshleifer/distilbart-cnn-12-6"
+SENT_MODEL = "nynn/Fintuned_Sentiment"
+NER_MODEL = "Babelscape/wikineural-multilingual-ner"
+SUMMAR, TOK, SENT_CLF, NER = load_pipes()
+
+MAX_TOK = 1024
+TARGET_WORDS = 225
+LABEL_MAP = {"LABEL_0": "Negative", "LABEL_1": "Positive", "LABEL_2": "Neutral"}
+COLOR_MAP = {"Positive": "green", "Negative": "red", "Neutral": "gray"}
+
+# ───────────────── Sidebar input ───────────────────────────────────────
+with st.sidebar:
+    st.header("Input News to Analyze:")
+    txt_input = st.text_area("Paste news article", height=150)
+    pdf_file = st.file_uploader("Or upload PDF", type=["pdf"])
+    sent_count = st.slider("Summary length (sentences)",
+                           min_value=1, max_value=5, value=3, step=1)
+    run_btn = st.button("🔍 Analyze", use_container_width=True)
+
+raw_text = extract_pdf(pdf_file) if pdf_file else txt_input.strip()
+
+# ───────────────── Main pipeline ───────────────────────────────────────
+if run_btn:
+    if not raw_text:
+        st.warning("Please provide text or a PDF first.")
+        st.stop()
+
+    with st.spinner("Analyzing"):
+        full_sum = summarise(raw_text)
+        summary = shorten(full_sum, sent_count)
+
+    cols = st.columns([2, 1])
+    with cols[0]:
+        st.subheader("📝 Summary")
+        st.write(summary)
+
+    with cols[1]:
+        res = SENT_CLF(summary)[0]
+        label = LABEL_MAP.get(res["label"], res["label"])
+        colour = COLOR_MAP[label]
+        st.subheader("📊 Sentiment")
+        st.markdown(f"<h3 style='color:{colour};margin-bottom:0'>{label}</h3>"
+                    f"<p>{res['score']*100:.1f}% Confidence</p>",
+                    unsafe_allow_html=True)
+
+    tags = tag_entities(summary)
+    st.subheader("🏷️ Relevant Tags")
+
+    if tags:
+        # CSS for the badge pills
+        pill_css = """
+        <style>
+        .tag-pill {
+            display: inline-block;
+            background: #f0f2f6;
+            color: #333;
+            padding: 4px 10px;
+            margin: 2px 4px 2px 0;
+            border-radius: 12px;
+            font-size: 0.9em;
+        }
+        .tag-cat {
+            font-weight: 600;
+            margin-top: 0;
+            margin-bottom: 4px;
+        }
+        </style>
+        """
+        st.markdown(pill_css, unsafe_allow_html=True)
+
+        # Render each category as a header + pills
+        for category, vals in tags.items():
+            st.markdown(f"<div class='tag-cat'>{category}</div>", unsafe_allow_html=True)
+            pills = "".join(f"<span class='tag-pill'>{v}</span>" for v in vals)
+            st.markdown(pills, unsafe_allow_html=True)
+    else:
+        st.info("No entities detected.")
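
Note: the new LABEL_MAP above hard-codes how nynn/Fintuned_Sentiment names its three classes. A minimal sketch for checking that assumption outside the app (not part of this commit; it assumes the model downloads locally and transformers is installed):

    # Sketch, not part of the commit: print the id2label mapping declared by the
    # sentiment model so it can be compared against LABEL_MAP in app.py.
    from transformers import AutoConfig

    cfg = AutoConfig.from_pretrained("nynn/Fintuned_Sentiment")
    print(cfg.id2label)  # expected to correspond to LABEL_0/1/2 as mapped in app.py

If the model already reports human-readable labels (e.g. "Negative"), the app's LABEL_MAP.get(res["label"], res["label"]) fallback still displays them unchanged.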