diff options
Diffstat (limited to 'src/windows')
| -rw-r--r-- | src/windows/data_fetcher.py | 88 | ||||
| -rw-r--r-- | src/windows/modeltrainer.py | 217 | ||||
| -rw-r--r-- | src/windows/plotviewer.py | 266 |
3 files changed, 571 insertions, 0 deletions
class DataDownloader:
    """Small Tk window that downloads "hot" posts from Reddit with a progress bar.

    For every subreddit in ``SUBREDDITS`` up to ``posts_per_subreddit`` posts
    are fetched from the OAuth API and collected in ``self.posts`` (a list of
    the raw ``post['data']`` dictionaries). The window closes itself once the
    download has finished.
    """

    # Seconds to wait for any single HTTP request before giving up.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        # Per-instance state. These used to be class attributes, which made
        # the ``posts`` list shared between instances and froze ``start_time``
        # at import time instead of at download time.
        self.posts = []
        self.after = None
        self.downloaded = 0
        self.start_time = time.time()
        # Set by on_closing() so the download loop stops touching a destroyed
        # Tk root when the user closes the window mid-download.
        self._closed = False

        self.root = tkinter.Tk()
        self.root.title('Downloading Data - 0%')

        # center the window
        posx = int(self.root.winfo_screenwidth() / 2 - 250)
        posy = int(self.root.winfo_screenheight() / 2 - 50)
        self.root.geometry(f'500x100+{posx}+{posy}')

        self.root.resizable(False, False)
        self.root.protocol('WM_DELETE_WINDOW', self.on_closing)
        self.subreddits = SUBREDDITS
        self.posts_per_subreddit = 100

        self.progress = ttk.Progressbar(self.root, orient='horizontal', length=500, mode='determinate')
        self.progress['value'] = 0
        self.progress['maximum'] = 100

        self.download_label = tkinter.Label(
            self.root,
            text='Downloading: 0 / {} Posts'.format(len(SUBREDDITS) * self.posts_per_subreddit),
        )

        self.download_label.pack(fill='x', padx=10, pady=10, side=tkinter.TOP, anchor='w')
        self.progress.pack(fill='x', padx=10, pady=10)

        self.root.bind('<<DownloadComplete>>', lambda e: self.on_closing())

        # SECURITY NOTE(review): these credentials are hard-coded in source
        # control. They are kept here only to preserve behaviour; they should
        # be rotated and loaded from configuration/environment instead.
        client_id = 'dog1LGxsD9M3bXtglOzKsQ'
        client_secret = 'nc-HHPBGtz51-_r4vNLGcuCHmT39Lw'
        username = 'NoSeason1949'
        password = 'Password@1234'
        auth = requests.auth.HTTPBasicAuth(client_id, client_secret)
        data = {
            'grant_type': 'password',
            'username': username,
            'password': password,
        }
        headers = {'User-Agent': 'RedditTest/0.1 by {}'.format(username)}
        res = requests.post(
            'https://www.reddit.com/api/v1/access_token',
            auth=auth, data=data, headers=headers, timeout=self.REQUEST_TIMEOUT,
        )
        # Fail with a clear HTTP error instead of a KeyError on the JSON body.
        res.raise_for_status()
        self.token = res.json()['access_token']
        self.headers = {**headers, 'Authorization': f"bearer {self.token}"}

    def on_closing(self):
        """Close the window and tell a running download to stop."""
        self._closed = True
        self.root.destroy()

    def _fetch_page(self, subreddit):
        """Return the next page of posts for *subreddit* and advance ``self.after``.

        An empty list is returned when the request does not answer with
        HTTP 200, so the caller can move on instead of retrying forever.
        """
        url = f"https://oauth.reddit.com/r/{subreddit}/hot?limit=100"
        if self.after:
            url += f"&after={self.after}"
        res = requests.get(url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
        if res.status_code != 200:
            return []
        listing = res.json()['data']
        self.after = listing['after']
        return listing['children']

    def _refresh_progress(self):
        """Update the progress bar, window title and ETA label, then repaint."""
        total = self.posts_per_subreddit * len(self.subreddits)
        fraction = self.downloaded / total
        percent = min(fraction * 100, 100)
        self.progress['value'] = percent
        self.root.title(f'Downloading Data - {int(percent)}%')
        if self.downloaded < total:
            elapsed = time.time() - self.start_time
            # Projected total duration minus the time already spent.
            remaining = elapsed / fraction - elapsed
            eta = time.strftime('%H:%M:%S', time.gmtime(remaining))
            self.download_label['text'] = (
                f'Downloading: {self.downloaded} / {total} Posts - {eta} Remaining'
            )
        else:
            self.download_label['text'] = 'Download Complete'
        # Process pending Tk events; this may run on_closing().
        self.root.update()

    def download(self):
        """Download the posts of every subreddit, updating the UI as it goes."""
        # Measure the ETA from the moment the download really starts.
        self.start_time = time.time()
        for subreddit in self.subreddits:
            # The pagination cursor belongs to one subreddit only.
            self.after = None
            fetched = 0
            while fetched < self.posts_per_subreddit and not self._closed:
                children = self._fetch_page(subreddit)
                if not children:
                    # Request failed or the listing is empty: skip ahead
                    # rather than spinning on the same URL.
                    break
                for post in children:
                    if fetched >= self.posts_per_subreddit or self._closed:
                        break
                    self.posts.append(post['data'])
                    fetched += 1
                    self.downloaded += 1
                    self._refresh_progress()
                if self.after is None:
                    # Reddit reports no further page for this listing.
                    break
            if self._closed:
                # The window is gone; do not touch the Tk root any more.
                return

        self.root.event_generate('<<DownloadComplete>>', when='tail')

    def start(self):
        """Schedule the download and enter the Tk main loop."""
        self.root.after(0, self.download)
        self.root.mainloop()
# Patterns are compiled once at import time; preprocess() is applied to every
# row of the posts frame, so rebuilding them per call is wasted work.
# Raw strings keep the regex text identical while avoiding the invalid
# "\(" / "\)" escape sequences of a normal string literal.
_URL_PATTERN = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'
    r'(?:%[0-9a-fA-F][0-9a-fA-F]))+'
)
_MENTION_PATTERN = re.compile(r"(@[A-Za-z0-9_]+)")
# NOTE(review): several ranges below are very broad (for example
# U+24C2-U+1F251 and U+10000-U+10FFFF), so this strips far more than emoji,
# including most non-Latin scripts. Kept as-is to preserve behaviour.
_EMOJI_PATTERN = re.compile(
    "["
    u"\U0001F600-\U0001F64F"
    u"\U0001F300-\U0001F5FF"
    u"\U0001F680-\U0001F6FF"
    u"\U0001F1E0-\U0001F1FF"
    u"\U00002500-\U00002BEF"
    u"\U00002702-\U000027B0"
    u"\U000024C2-\U0001F251"
    u"\U0001f926-\U0001f937"
    u"\U00010000-\U0010ffff"
    u"\u2640-\u2642"
    u"\u2600-\u2B55"
    u"\u200d"
    u"\u23cf"
    u"\u23e9"
    u"\u231a"
    u"\ufe0f"
    u"\u3030"
    "]+",
    flags=re.UNICODE,
)
_PUNCTUATION_TABLE = str.maketrans('', '', punctuation)
# Filled lazily on first use so importing the module does not read the NLTK
# stop-word corpus before it is needed.
_PREPROCESS_TOOLS = {}


def _stemmer_and_ignored_words():
    """Return the shared ``(stemmer, ignored_words)`` pair, building it once."""
    if not _PREPROCESS_TOOLS:
        _PREPROCESS_TOOLS['stemmer'] = SnowballStemmer('english')
        # A frozenset gives O(1) membership tests instead of scanning a list.
        _PREPROCESS_TOOLS['ignored'] = frozenset(stopwords.words('english')) | frozenset(punctuation)
    return _PREPROCESS_TOOLS['stemmer'], _PREPROCESS_TOOLS['ignored']


def preprocess(message):
    """Normalise a post's text for TF-IDF vectorisation.

    The value is converted with ``str()`` and lower-cased; links, @mentions
    and the characters matched by the emoji pattern are removed; HTML
    entities are unescaped; punctuation is stripped; finally English stop
    words are dropped and the remaining words are stemmed.

    Args:
        message: Any value; it is converted with ``str()`` first.

    Returns:
        The cleaned text as a single space-separated string.
    """
    stemmer, ignored_words = _stemmer_and_ignored_words()

    # Convert message to lower case
    message = str(message).lower()

    # Remove all the links, mentions and emojis
    message = _URL_PATTERN.sub('', message)
    message = _MENTION_PATTERN.sub('', message)
    message = _EMOJI_PATTERN.sub('', message)

    # Remove HTML entities
    message = html.unescape(message)

    # Strip blank spaces, then remove all the punctuation
    message = message.strip().translate(_PUNCTUATION_TABLE)

    # Remove stopwords and perform stemming
    return ' '.join(
        stemmer.stem(word) for word in message.split() if word not in ignored_words
    )
def edit_textbox(self, text, line, type='wait'):
    """Write a status entry into the read-only ``self.updates`` textbox.

    Args:
        text: Message to display.
        line: 1-based line number of the textbox the entry is written at.
        type: ``'wait'`` inserts a pending entry (clock emoji, trailing
            ``...``). Any other value first deletes the given line and then
            inserts a finished entry (check-mark emoji) in its place.
    """
    pending = type == 'wait'
    marker = '🕐' if pending else '✅'
    start_index = str(line) + '.0'
    end_index = str(line + 1) + '.0'

    # The textbox is kept disabled so the user cannot type into it; enable
    # it only for the duration of this edit.
    self.updates.configure(state='normal')
    if pending:
        entry = marker + ' ' + text + '...' + '\n\n'
    else:
        self.updates.delete(start_index, end_index)
        entry = marker + ' ' + text + '\n\n'
    self.updates.insert(start_index, entry)
    self.updates.configure(state='disabled')

    # scroll to the edited line and repaint the window
    self.updates.see(start_index)
    self.update()
def save_model(self, model, model_name):
    """Pickle *model* to ``<self.model_dir>/<model_name>.pkl``.

    The target directory, including any missing parent directories, is
    created when it does not exist yet.

    Args:
        model: Object to serialise with ``pickle``.
        model_name: File name without the ``.pkl`` extension.
    """
    # makedirs(exist_ok=True) replaces the racy "exists() then mkdir()" pair
    # and also works when model_dir is a nested path.
    os.makedirs(self.model_dir, exist_ok=True)
    model_path = os.path.join(self.model_dir, model_name + '.pkl')
    with open(model_path, 'wb') as f:
        pickle.dump(model, f)
def pretty_number(number):
    """Format a number with a B / M / K suffix for plot annotations.

    Values whose magnitude is at least 1,000 are returned as a string with
    two decimals and a suffix (for example ``'1.50 K'``). Smaller values are
    returned unchanged, as they always were. The magnitude is used for the
    threshold so negative values are abbreviated too (``-1500`` gives
    ``'-1.50 K'``).
    """
    magnitude = abs(number)
    for threshold, suffix in ((1000000000, 'B'), (1000000, 'M'), (1000, 'K')):
        if magnitude >= threshold:
            return '{:.2f} {}'.format(number / threshold, suffix)
    return number


# Author Scoring Function
def author_scores(df):
    """Aggregate an engagement score per (author, subreddit) pair.

    For every post ``final_score = score * upvote_ratio + num_comments`` is
    computed; the numeric columns are then summed per author and subreddit.

    Args:
        df: DataFrame with at least the columns ``author``, ``score``,
            ``subreddit``, ``num_comments`` and ``upvote_ratio``. It is not
            modified.

    Returns:
        A new DataFrame with one row per (author, subreddit) and the summed
        ``score``, ``num_comments``, ``upvote_ratio`` and ``final_score``.
    """
    columns = ['author', 'score', 'subreddit', 'num_comments', 'upvote_ratio']
    # assign() builds a new frame, so no column is written onto a slice of
    # the caller's data (which triggered pandas' chained-assignment warning).
    scored = df[columns].assign(
        final_score=lambda frame: frame['score'] * frame['upvote_ratio'] + frame['num_comments']
    )
    return scored.groupby(['author', 'subreddit']).sum().reset_index()
self.tabview.add("Posts per Day") + self.tabview.add("Top 10 Authors") + self.tabview.add("Best Time Analysis") + self.tabview.add("Scores Boxplot") + self.tabview.add("Scores vs Comments") + self.tabview.add("View Data / Predictions") + + fig = Figure(figsize=(12, 8), dpi=72) + self.posts_plot = fig.add_subplot(111) + self.posts_plot.set_title('Number of posts per subreddit') + self.posts_plot.set_xlabel('Subreddit') + self.posts_plot.set_xticklabels(np.arange(len(SUBREDDITS)), rotation=45) + self.posts_plot.set_ylabel('Number of posts') + sns.countplot(x='subreddit', data=self.posts, ax=self.posts_plot) + for p in self.posts_plot.patches: + self.posts_plot.annotate('{:1.0f} posts'.format(p.get_height()), (p.get_x() + p.get_width() / 2., p.get_height()), + ha = 'center', va = 'center', xytext = (0, 10), textcoords = 'offset points') + self.posts_plot.figure.tight_layout() + self.posts_plot = FigureCanvasTkAgg(fig, self.tabview.tab("Posts")) + self.posts_plot.get_tk_widget().pack(side=tkinter.TOP, fill=tkinter.BOTH, expand=1) + + fig = Figure(figsize=(12, 8), dpi=72) + self.subscribers_plot = fig.add_subplot(111) + self.subscribers_plot.set_title('Number of subscribers per subreddit') + self.subscribers_plot.set_xlabel('Subreddit') + self.subscribers_plot.set_xticklabels(np.arange(len(SUBREDDITS)), rotation=45) + self.subscribers_plot.set_ylabel('Number of subscribers') + sns.barplot(x='subreddit', y='subreddit_subscribers', data=self.posts, ax=self.subscribers_plot) + for p in self.subscribers_plot.patches: + self.subscribers_plot.annotate('{}'.format(pretty_number(p.get_height())), (p.get_x() + p.get_width() / 2., p.get_height()), + ha = 'center', va = 'center', xytext = (0, 10), textcoords = 'offset points') + self.subscribers_plot.figure.tight_layout() + self.subscribers_plot = FigureCanvasTkAgg(fig, self.tabview.tab("Subscribers")) + self.subscribers_plot.get_tk_widget().pack(side=tkinter.TOP, fill=tkinter.BOTH, expand=1) + + fig = Figure(figsize=(12, 8), 
dpi=72) + self.author_activity_plot = fig.add_subplot(111) + self.author_activity_plot.set_title('Authors Posting in multiple Subreddits') + n_subreddits = self.posts.groupby('author')['subreddit'].nunique() + sns.countplot(x=n_subreddits, palette=sns.color_palette("husl"), ax=self.author_activity_plot) + for p in self.author_activity_plot.patches: + self.author_activity_plot.annotate('{:1.0f} authors'.format(p.get_height()), (p.get_x() + p.get_width() / 2., p.get_height()), + ha = 'center', va = 'center', xytext = (0, 10), textcoords = 'offset points') + self.author_activity_plot.set_xlabel('Number of Subreddits') + self.author_activity_plot.set_ylabel('Number of Authors') + self.author_activity_plot.figure.tight_layout() + self.author_activity_plot = FigureCanvasTkAgg(fig, self.tabview.tab("Author Activity")) + self.author_activity_plot.get_tk_widget().pack(side=tkinter.TOP, fill=tkinter.BOTH, expand=1) + + fig = Figure(figsize=(12, 8), dpi=72) + n_upvotes = self.posts.groupby('author')['ups'].sum() + self.multi_subreddit_plot = fig.add_subplot(111) + self.multi_subreddit_plot.set_title('Does posting in multiple subreddits drives more upvotes?') + sns.barplot(x=n_subreddits, y=n_upvotes, palette=sns.color_palette("pastel"), ax=self.multi_subreddit_plot) + for p in self.multi_subreddit_plot.patches: + self.multi_subreddit_plot.annotate('{:1.0f} upvotes'.format(p.get_height()), (p.get_x() + p.get_width() / 2., p.get_height()), + ha = 'center', va = 'center', xytext = (0, 10), textcoords = 'offset points') + self.multi_subreddit_plot.set_xlabel('Number of Subreddits') + self.multi_subreddit_plot.set_ylabel('Number of Upvotes') + self.multi_subreddit_plot.set_xticks(list(range(0, len(n_subreddits.unique()))), list(map(lambda x: '{} Subreddits'.format(x) if x > 1 else '{} Subreddits'.format(x), list(range(1, len(n_subreddits.unique()) + 1))))) + self.multi_subreddit_plot.figure.tight_layout() + self.multi_subreddit_plot = FigureCanvasTkAgg(fig, 
self.tabview.tab("Multi-Subreddit Analysis")) + self.multi_subreddit_plot.get_tk_widget().pack(side=tkinter.TOP, fill=tkinter.BOTH, expand=1) + + + ppd_df = self.posts.groupby(['subreddit', 'created_utc']).size().reset_index(name='counts') + ppd_df['created_utc'] = pd.to_datetime(ppd_df['created_utc']).dt.date + ppd_df = ppd_df.groupby(['subreddit', 'created_utc']).sum().reset_index() + ppd_df = ppd_df.pivot(index='created_utc', columns='subreddit', values='counts') + ppd_df = ppd_df.fillna(0) + last_6M = datetime.date.today() - datetime.timedelta(days=180) + ppd_df = ppd_df.loc[ppd_df.index >= last_6M] + palette = sns.color_palette("dark6", len(SUBREDDITS)) + + + fig, axes = plt.subplots(5, 3, figsize=(20, 20), dpi=24) + fig.suptitle('Number of posts per day per subreddit (Last 6 Months)\n', fontsize=16) + fig.subplots_adjust(hspace=0.5, wspace=0.5) + for i, subreddit in enumerate(SUBREDDITS): + ax = axes[i // 3, i % 3] + ax.set_title(subreddit) + ax.set_xlabel('Date') + ax.set_ylabel('Number of Posts') + ax.set_xticklabels(ppd_df.index, rotation=0) + sns.lineplot(data=ppd_df[subreddit], ax=ax, color=palette[i]) + self.ppd_plot = FigureCanvasTkAgg(fig, self.tabview.tab("Posts per Day")) + self.ppd_plot.figure.tight_layout() + self.ppd_plot.get_tk_widget().pack(side=tkinter.TOP, fill=tkinter.BOTH, expand=1) + + top_10_authors_per_subreddit = author_scores(self.posts).groupby('subreddit').apply(lambda x: x.nlargest(10, 'final_score')) + top_10_authors_per_subreddit = top_10_authors_per_subreddit.reset_index(drop=True) + fig, axes = plt.subplots(5, 3, figsize=(20, 20), dpi=24) + fig.suptitle('Top 10 Authors per Subreddit\n', fontsize=16) + fig.subplots_adjust(hspace=0.5, wspace=0.5) + for i, subreddit in enumerate(SUBREDDITS): + ax = axes[i // 3, i % 3] + ax.set_title(subreddit) + ax.set_xticklabels(axes[i//3, i%3].get_xticklabels(), rotation=30, horizontalalignment='right') + sns.barplot(x='author', y='final_score', 
data=top_10_authors_per_subreddit[top_10_authors_per_subreddit['subreddit'] == subreddit], ax=ax, palette=sns.color_palette("pastel", 10)) + ax.set_ylabel('Final Score') + ax.set_xlabel('') + for p in axes[i//3, i%3].patches: + axes[i//3, i%3].annotate('{:1.0f}'.format(p.get_height()), (p.get_x() + p.get_width() / 2., p.get_height()), + ha = 'center', va = 'center', xytext = (0, 10), textcoords = 'offset points') + self.top_10_authors_plot = FigureCanvasTkAgg(fig, self.tabview.tab("Top 10 Authors")) + self.top_10_authors_plot.figure.tight_layout() + self.top_10_authors_plot.get_tk_widget().pack(side=tkinter.TOP, fill=tkinter.BOTH, expand=1) + + # Finding the best time to post on each subreddit + best_time_df = self.posts[['subreddit', 'created_utc', 'score', 'num_comments']] + best_time_df['final_score'] = best_time_df['score'] + best_time_df['num_comments'] + best_time_df.drop(['score', 'num_comments'], axis=1, inplace=True) + + # Convert the created_utc column to datetime + best_time_df['created_utc'] = pd.to_datetime(best_time_df['created_utc']) + best_time_df['day'] = best_time_df['created_utc'].dt.day_name() + best_time_df['hour'] = best_time_df['created_utc'].dt.hour + best_time_df.drop('created_utc', axis=1, inplace=True) + + # Find total engagement per hour + best_time_df = best_time_df.groupby(['subreddit', 'day', 'hour']).sum() + best_time_df = best_time_df.reset_index() + days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'] + best_time_df['day'] = pd.Categorical(best_time_df['day'], categories=days, ordered=True) + + # Plotting the best time to post on each subreddit + fig, axes = plt.subplots(5, 3, figsize=(20, 20), dpi=24) + fig.suptitle('Best Time to Post on Each Subreddit\n', fontsize=20) + for i, subreddit in enumerate(best_time_df['subreddit'].unique()): + sns.lineplot(x='hour', y='final_score', hue='day', data=best_time_df[best_time_df['subreddit'] == subreddit], ax=axes[i//3, i%3], 
palette=sns.color_palette("husl", 7)) + axes[i//3, i%3].set_title(subreddit) + axes[i//3, i%3].set_xticks(range(0, 24)) + axes[i//3, i%3].set_xticklabels(list(map(lambda x: (f'0{x}:00' if x < 10 else f'{x}:00'), list(range(0, 24)))), rotation=45, horizontalalignment='right') + axes[i//3, i%3].set_xlabel('Time of Day') + axes[i//3, i%3].set_ylabel('Total Engagement') + self.best_time_plot = FigureCanvasTkAgg(fig, self.tabview.tab("Best Time Analysis")) + self.best_time_plot.figure.tight_layout() + self.best_time_plot.get_tk_widget().pack(side=tkinter.TOP, fill=tkinter.BOTH, expand=1) + + fig = Figure(figsize=(12, 8), dpi=72) + self.scores_boxplot = fig.add_subplot(111) + sns.boxplot(x='subreddit', y='score', data=self.posts, ax=self.scores_boxplot) + self.scores_boxplot.set_title('Boxplot of Scores in Each Subreddit') + self.scores_boxplot.set_xlabel('Subreddit') + self.scores_boxplot.set_ylabel('Score') + self.scores_boxplot = FigureCanvasTkAgg(fig, self.tabview.tab("Scores Boxplot")) + self.scores_boxplot.figure.tight_layout() + self.scores_boxplot.get_tk_widget().pack(side=tkinter.TOP, fill=tkinter.BOTH, expand=1) + + # Scatterplot of the scores and number of comments in each subreddit + fig, axes = plt.subplots(5, 3, figsize=(20, 20), dpi=24) + fig.suptitle('Scatterplot of Scores and Number of Comments in Each Subreddit\n', fontsize=20) + palette=sns.color_palette("deep", 15) + for i, subreddit in enumerate(self.posts['subreddit'].unique()): + sns.scatterplot(x='score', y='num_comments', data=self.posts[self.posts['subreddit'] == subreddit], ax=axes[i//3, i%3], color=palette[i]) + axes[i//3, i%3].set_title(subreddit) + axes[i//3, i%3].set_xlabel('Score') + axes[i//3, i%3].set_ylabel('Number of Comments') + self.scores_comments_plot = FigureCanvasTkAgg(fig, self.tabview.tab("Scores vs Comments")) + self.scores_comments_plot.figure.tight_layout() + self.scores_comments_plot.get_tk_widget().pack(side=tkinter.TOP, fill=tkinter.BOTH, expand=1) + + + # View Data / 
def train_models(self):
    """Run the modal training window, then swap the tab's controls to "Predict".

    A ``ModelTrainer`` child window is opened on ``self.posts`` and this
    method blocks until that window is destroyed. Afterwards the existing
    label and button are replaced with the "Models found" label and a
    "Predict" button.
    """
    # open model training child window and keep input focus on it
    trainer = ModelTrainer(self, self.posts)
    trainer.grab_set()
    trainer.focus_set()
    self.wait_window(trainer)

    # drop the old label and button before building their replacements
    for stale_widget in (self.models_label, self.models_button):
        stale_widget.destroy()

    tab_name = "View Data / Predictions"
    self.models_label = customtkinter.CTkLabel(
        self.tabview.tab(tab_name),
        text="Models found. Predict by entering data on the next screen.",
        pady=10,
    )
    self.models_label.pack()
    self.models_button = customtkinter.CTkButton(self.tabview.tab(tab_name), text="Predict")
    self.models_button.pack()
