|
import streamlit as st |
|
import pandas as pd |
|
import re |
|
import nltk |
|
from PIL import Image |
|
import os |
|
import numpy as np |
|
import seaborn as sns |
|
from wordcloud import WordCloud, STOPWORDS |
|
from nltk.corpus import stopwords |
|
import datasets |
|
from datasets import load_dataset |
|
import matplotlib.pyplot as plt |
|
import sklearn |
|
from sklearn.preprocessing import LabelEncoder |
|
# Use the diverging red/blue palette for every seaborn chart drawn below.
sns.set_palette("RdBu")

# Load the "merve/poetry" dataset in streaming mode and materialise its
# "train" split as a pandas DataFrame.
dataset = load_dataset("merve/poetry", streaming=True)
df = pd.DataFrame.from_dict(dataset["train"])

# Directory used to locate bundled assets: the folder of this script when
# ``__file__`` is defined, otherwise the current working directory.
if "__file__" in locals():
    d = os.path.dirname(__file__)
else:
    d = os.getcwd()

# Fetch the NLTK stopword corpus, then read the English stopword list.
nltk.download("stopwords")
stop = stopwords.words('english')
|
|
|
|
|
|
|
def standardize(text, remove_digits=True):
    """Strip special characters from *text* and lowercase the result.

    Every character that is not an ASCII letter, a digit or whitespace is
    removed; whitespace itself is left untouched.

    Args:
        text: The string to clean.
        remove_digits: Accepted for backward compatibility only. It is not
            consulted, and digits are always kept, exactly as before.

    Returns:
        The cleaned, lowercased string.
    """
    # Raw string: "\d" and "\s" are regex classes, not string escapes. In a
    # plain literal they trigger an invalid-escape warning on recent Pythons.
    cleaned = re.sub(r"[^a-zA-Z\d\s]", "", text)
    return cleaned.lower()
|
# Silence the warning Streamlit shows for argument-less ``st.pyplot()`` calls.
# Streamlit raises StreamlitAPIException for option names it does not
# recognise (newer releases dropped this one), and that must not take the
# whole page down.
try:
    st.set_option('deprecation.showPyplotGlobalUse', False)
except st.errors.StreamlitAPIException:
    pass

st.write("Poetry dataset, content column cleaned from special characters and lowercased")

# Build the lookup once: set membership is constant time, whereas scanning
# the stopword list for every token of every poem is not.
_stopword_set = set(stop)


def _drop_stopwords(text):
    """Return *text* with whitespace-separated stopword tokens removed."""
    return ' '.join(word for word in text.split() if word not in _stopword_set)


# NOTE(review): the stopword filter is case-sensitive and runs before the text
# is lowercased, so capitalised or punctuated stopwords ("The", "and,") are
# kept. Confirm whether that is intended before changing the order.
df.content = df.content.apply(_drop_stopwords)
df.content = df.content.apply(standardize)

st.dataframe(df)
|
|
|
st.subheader("Visualization on dataset statistics")


def _show_count_plot(caption, column, tick_rotation=0, **catplot_kwargs):
    """Write *caption*, then render a count plot of ``df[column]``.

    Args:
        caption: Text written above the chart.
        column: Name of the ``df`` column whose values are counted.
        tick_rotation: Rotation, in degrees, applied to the x tick labels.
        **catplot_kwargs: Extra keyword arguments forwarded to ``sns.catplot``.
    """
    st.write(caption)
    grid = sns.catplot(x=column, data=df, kind="count", **catplot_kwargs)
    plt.xticks(rotation=tick_rotation)
    # Pass the figure explicitly: argument-less st.pyplot() depends on the
    # global pyplot figure, which Streamlit has deprecated.
    st.pyplot(grid.figure)
    # With an explicit figure Streamlit does not clear it, so release it here
    # to avoid piling up open figures across script reruns.
    plt.close(grid.figure)


_show_count_plot("Number of poems written in each type", "type")
_show_count_plot("Number of poems for each age", "age")
_show_count_plot("Number of poems for each author", "author", tick_rotation=90, aspect=4)
|
|
|
|
|
st.write("Distributions of poem types according to ages and authors, seems that folks in renaissance loved the love themed poems and nature themed poems became popular later")

# Replace author names with integer codes for the y axis. This overwrites
# ``df.author`` in place, so the names are no longer in the frame afterwards.
le = LabelEncoder()
df.author = le.fit_transform(df.author)

age_author_grid = sns.catplot(x="age", y="author", hue="type", data=df)
# Pass the figure explicitly: argument-less st.pyplot() depends on the global
# pyplot figure, which Streamlit has deprecated.
st.pyplot(age_author_grid.figure)
# Streamlit leaves an explicitly passed figure open; close it so reruns do not
# accumulate figures.
plt.close(age_author_grid.figure)
|
|
|
|
|
|
|
|
|
def _token_counts(texts):
    """Count whitespace-separated tokens across a Series of strings.

    Returns the ``value_counts`` Series: tokens as the index, occurrence
    counts as the values, most frequent first.
    """
    return texts.str.split(expand=True).unstack().value_counts()


# Token frequencies over all poems, then restricted to each age.
words = _token_counts(df.content)
renaissance = _token_counts(df.content.loc[df.age == "Renaissance"])
modern = _token_counts(df.content.loc[df.age == "Modern"])

st.subheader("Visualizing content")

# Image loaded from the asset directory as a numpy array.
# NOTE(review): nothing in the visible code consumes ``mask``; confirm whether
# it was meant to be handed to the word cloud.
mask = np.array(Image.open(os.path.join(d, "poet.png")))

import matplotlib.pyplot as plt
|
def word_cloud(content, title):
    """Render a frequency-weighted word cloud on the Streamlit page.

    Args:
        content: Word counts with the words as index and the counts as
            values. The callers in this file pass a ``value_counts`` Series.
        title: Heading drawn above the cloud.
    """
    wc = WordCloud(background_color="white", max_words=200, contour_width=3,
                   stopwords=STOPWORDS, max_font_size=50)

    # Joining the unique index values and calling generate() counts every word
    # exactly once, which throws the real frequencies away. Feed the counts
    # directly instead. generate_from_frequencies() ignores the ``stopwords``
    # setting, so stopwords are filtered here; purely numeric tokens are also
    # skipped, mirroring WordCloud's include_numbers=False default.
    frequencies = {
        word: count
        for word, count in content.items()
        if word.lower() not in STOPWORDS and not word.isdigit()
    }
    wc.generate_from_frequencies(frequencies)

    fig = plt.figure(figsize=(10, 10))
    plt.title(title, fontsize=20)
    plt.imshow(wc.recolor(colormap='magma', random_state=42), cmap=plt.cm.gray, interpolation="bilinear", alpha=0.98)
    plt.axis('off')

    # Pass the figure explicitly: argument-less st.pyplot() depends on the
    # global pyplot figure, which Streamlit has deprecated.
    st.pyplot(fig)
    # Streamlit leaves an explicitly passed figure open; close it so reruns do
    # not accumulate figures.
    plt.close(fig)
|
|
|
st.subheader("Most appearing words excluding stopwords in poems according to ages")

# One cloud per age, modern poems first.
for age_counts, cloud_title in (
    (modern, "Word Cloud of Modern Poems"),
    (renaissance, "Word Cloud Renaissance Poems"),
):
    word_cloud(age_counts, cloud_title)

# NOTE(review): ``df.content`` had NLTK stopwords filtered out earlier in the
# script, so confirm that the "including stopwords" caption is still accurate.
st.write("Most appearing words including stopwords")

# ``words`` is ordered by descending count, so this is the 50 most frequent.
top_words = words[0:50]
st.bar_chart(top_words)
|
|