From c4f7f3bd974a381715f55b1c6681808f1b68bb70 Mon Sep 17 00:00:00 2001
From: Bobby
Date: Fri, 2 Dec 2022 20:26:43 -0500
Subject: final changes

Raise DataDownloader.posts_per_subreddit from 100 to 1000 and turn its
WM_DELETE_WINDOW handler (on_closing) into a no-op.

Record evaluation metrics while training: for every model and for both
targets (ups, num_comments) ModelTrainer now predicts on X_test_*,
computes MSE, MAE, R2 and RMSE, and dumps them together with the
predicted and actual values to ups_metrics.json and
num_comments_metrics.json under model_dir.

Add a ModelPlots window that loads those two JSON files and shows, in a
tab view, bar charts of R2/MAE/MSE per model, predicted-vs-actual
regression plots and residual/score histograms.

Wire the "Ups Metrics", "Num Comments Metrics" and "Model Plots" buttons
in PlotViewer to handlers, and replace the model combo box with an
option menu that defaults to DummyRegressor.
---
Review notes (not part of the commit message):
 * The message-box titles in PlotViewer.ups_metrics and
   PlotViewer.num_comments_metrics were plain strings containing
   "{selected_model}"; they are f-strings below so the selected model
   name is actually interpolated into the title.
 * This patch was restored from a whitespace-collapsed copy. Added blank
   lines were preserved; indentation and blank context lines are
   inferred from Python syntax and the hunk line counts.

 src/windows/data_fetcher.py |   4 +-
 src/windows/modelplots.py   | 171 ++++++++++++++++++++++++++++++++++++++++++++
 src/windows/modeltrainer.py |  39 ++++++++++
 src/windows/plotviewer.py   |  33 +++++++--
 4 files changed, 238 insertions(+), 9 deletions(-)
 create mode 100644 src/windows/modelplots.py

(limited to 'src')

diff --git a/src/windows/data_fetcher.py b/src/windows/data_fetcher.py
index 57763a1..212c1d7 100644
--- a/src/windows/data_fetcher.py
+++ b/src/windows/data_fetcher.py
@@ -27,7 +27,7 @@ class DataDownloader:
         self.root.resizable(False, False)
         self.root.protocol('WM_DELETE_WINDOW', self.on_closing)
         self.subreddits = SUBREDDITS
-        self.posts_per_subreddit = 100
+        self.posts_per_subreddit = 1000
         self.progress = ttk.Progressbar(self.root, orient='horizontal', length=500, mode='determinate')
         self.progress['value'] = 0
 
@@ -56,7 +56,7 @@ class DataDownloader:
         self.headers = {**headers, **{'Authorization': f"bearer {self.token}"}}
 
     def on_closing(self):
-        self.root.destroy()
+        pass
 
     def download(self):
         for subreddit in self.subreddits:
diff --git a/src/windows/modelplots.py b/src/windows/modelplots.py
new file mode 100644
index 0000000..ae895ac
--- /dev/null
+++ b/src/windows/modelplots.py
@@ -0,0 +1,171 @@
+import customtkinter
+import seaborn as sns
+from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg
+import matplotlib.pyplot as plt
+from matplotlib.figure import Figure
+import json
+import tkinter
+import numpy as np
+
+class ModelPlots(customtkinter.CTkToplevel):
+    def __init__(self, parent):
+        super().__init__(parent)
+        self.parent = parent
+        self.title("Model Plots")
+        posx = int(self.winfo_screenwidth()/2 - 400)
+        posy = int(self.winfo_screenheight()/2 - 300)
+        self.geometry("800x600+{}+{}".format(posx, posy))
+        self.grab_set()
+        self.focus_set()
+        self.resizable(True, True)
+        with open(f'models/ups_metrics.json', 'r') as f:
+            self.ups_metrics = json.load(f)
+        with open(f'models/num_comments_metrics.json', 'r') as f:
+            self.num_comments_metrics = json.load(f)
+
+        self.create_widgets()
+
+    def create_widgets(self):
+        self.tabview = customtkinter.CTkTabview(self)
+        self.tabview.add('R-Squares')
+        self.tabview.add('MAE')
+        self.tabview.add('MSE')
+        self.tabview.add('R-Square Comparison')
+        self.tabview.add('Predictions')
+        self.tabview.add('Residuals')
+
+        # R-Squares
+        fig, ax = plt.subplots(1, 2, figsize=(15, 3), dpi=36)
+        ups_m = {}
+        for k, v in self.ups_metrics.items():
+            ups_m[k] = v['r2']
+        sns.barplot(x=list(ups_m.keys()), y=list(ups_m.values()), ax=ax[0], palette='Blues_d')
+        ax[0].set_title('R-Square for Ups')
+
+        num_comments_m = {}
+        for k, v in self.num_comments_metrics.items():
+            num_comments_m[k] = v['r2']
+        sns.barplot(x=list(num_comments_m.keys()), y=list(num_comments_m.values()), ax=ax[1], palette='Greens_d')
+        ax[1].set_title('R-Square for Number of Comments')
+
+        for i in range(2):
+            ax[i].set_xticklabels(ax[i].get_xticklabels(), rotation=45)
+        self.r2plot = FigureCanvasTkAgg(fig, self.tabview.tab('R-Squares'))
+        self.r2plot.figure.tight_layout()
+        self.r2plot.get_tk_widget().pack(side=tkinter.TOP, fill=tkinter.BOTH, expand=1)
+
+        # MAE
+        fig, ax = plt.subplots(1, 2, figsize=(15, 3), dpi=36)
+        ups_m = {}
+        for k, v in self.ups_metrics.items():
+            ups_m[k] = v['mae']
+        sns.barplot(x=list(ups_m.keys()), y=list(ups_m.values()), ax=ax[0], palette='Reds_d')
+        ax[0].set_title('MAE for Ups')
+
+        num_comments_m = {}
+        for k, v in self.num_comments_metrics.items():
+            num_comments_m[k] = v['mae']
+        sns.barplot(x=list(num_comments_m.keys()), y=list(num_comments_m.values()), ax=ax[1], palette='Oranges_d')
+        ax[1].set_title('MAE for Number of Comments')
+
+        for i in range(2):
+            ax[i].set_xticklabels(ax[i].get_xticklabels(), rotation=45)
+        self.maeplot = FigureCanvasTkAgg(fig, self.tabview.tab('MAE'))
+        self.maeplot.figure.tight_layout()
+        self.maeplot.get_tk_widget().pack(side=tkinter.TOP, fill=tkinter.BOTH, expand=1)
+
+        # MSE
+        fig, ax = plt.subplots(1, 2, figsize=(15, 3), dpi=36)
+        ups_m = {}
+        for k, v in self.ups_metrics.items():
+            ups_m[k] = v['mse']
+        sns.barplot(x=list(ups_m.keys()), y=list(ups_m.values()), ax=ax[0], palette='Purples_d')
+        ax[0].set_title('MSE for Ups')
+
+        num_comments_m = {}
+        for k, v in self.num_comments_metrics.items():
+            num_comments_m[k] = v['mse']
+        sns.barplot(x=list(num_comments_m.keys()), y=list(num_comments_m.values()), ax=ax[1], palette='Greys_d')
+        ax[1].set_title('MSE for Number of Comments')
+
+        for i in range(2):
+            ax[i].set_xticklabels(ax[i].get_xticklabels(), rotation=45)
+        self.mseplot = FigureCanvasTkAgg(fig, self.tabview.tab('MSE'))
+        self.mseplot.figure.tight_layout()
+        self.mseplot.get_tk_widget().pack(side=tkinter.TOP, fill=tkinter.BOTH, expand=1)
+
+        # R-Square Comparison
+        fig, ax = plt.subplots(1, 2, figsize=(20, 5), dpi=36)
+        # Ups
+        ax[0].set_title('Ups')
+        ax[0].set_xlabel('Model')
+        ax[0].set_ylabel('R2 Score')
+        sns.barplot(x=list(self.ups_metrics.keys()), y=[r2['r2'] for r2 in self.ups_metrics.values()], ax=ax[0], palette='Blues_d')
+        ax[0].set_xticklabels(ax[0].get_xticklabels(), rotation=45)
+        # Number of Comments
+        ax[1].set_title('Number of Comments')
+        ax[1].set_xlabel('Model')
+        ax[1].set_ylabel('R2 Score')
+        sns.barplot(x=list(self.num_comments_metrics.keys()), y=[r2['r2'] for r2 in self.num_comments_metrics.values()], ax=ax[1], palette='Greens_d')
+        ax[1].set_xticklabels(ax[1].get_xticklabels(), rotation=45)
+        self.r2comparisonplot = FigureCanvasTkAgg(fig, self.tabview.tab('R-Square Comparison'))
+        self.r2comparisonplot.figure.tight_layout()
+        self.r2comparisonplot.get_tk_widget().pack(side=tkinter.TOP, fill=tkinter.BOTH, expand=1)
+
+        # Predictions
+        fig, ax = plt.subplots(7, 2, figsize=(12, 16), dpi=24)
+        for i, (k, v) in enumerate(self.ups_metrics.items()):
+            # ups
+            ax[i, 0].set_title('Ups - {}'.format(k))
+            ax[i, 0].set_xlabel('Actual')
+            ax[i, 0].set_ylabel('Predicted')
+            sns.regplot(x=v['actual'], y=v['pred'], ax=ax[i, 0], color='blue', scatter_kws={'alpha': 0.3})
+
+            # num_comments
+            ax[i, 1].set_title('Number of Comments - {}'.format(k))
+            ax[i, 1].set_xlabel('Actual')
+            ax[i, 1].set_ylabel('Predicted')
+            sns.regplot(x=self.num_comments_metrics[k]['actual'], y=self.num_comments_metrics[k]['pred'], ax=ax[i, 1], color='green', scatter_kws={'alpha': 0.3})
+        self.predplot = FigureCanvasTkAgg(fig, self.tabview.tab('Predictions'))
+        self.predplot.figure.tight_layout()
+        self.predplot.get_tk_widget().pack(side=tkinter.TOP, fill=tkinter.BOTH, expand=1)
+
+        # Residuals
+        fig, ax = plt.subplots(7, 6, figsize=(20, 16), dpi=16)
+        for i, (k, v) in enumerate(self.ups_metrics.items()):
+            # ups model
+            ax[i, 0].set_title(k + ' - Ups Residuals')
+            ax[i, 0].set_xlabel('Residuals')
+            ax[i, 0].set_ylabel('Frequency')
+            sns.distplot(np.array(v['actual']) - np.array(v['pred']), ax=ax[i, 0], color='blue', kde=False)
+
+            ax[i, 1].set_title(k + ' Ups Test Scores')
+            ax[i, 1].set_xlabel('Ups')
+            ax[i, 1].set_ylabel('Frequency')
+            sns.distplot(v['actual'], ax=ax[i, 1], color='blue', kde=False)
+
+            ax[i, 2].set_title(k + ' Ups Predicted Scores')
+            ax[i, 2].set_xlabel('Ups')
+            ax[i, 2].set_ylabel('Frequency')
+            sns.distplot(v['pred'], ax=ax[i, 2], kde=False, color='red')
+
+            # num_comments model
+            ax[i, 3].set_title(k + ' - Number of Comments Residuals')
+            ax[i, 3].set_xlabel('Residuals')
+            ax[i, 3].set_ylabel('Frequency')
+            sns.distplot(np.array(self.num_comments_metrics[k]['actual']) - np.array(self.num_comments_metrics[k]['pred']), ax=ax[i, 3], color='green', kde=False)
+
+            ax[i, 4].set_title(k + ' Number of Comments Test Scores')
+            ax[i, 4].set_xlabel('Number of Comments')
+            ax[i, 4].set_ylabel('Frequency')
+            sns.distplot(self.num_comments_metrics[k]['actual'], ax=ax[i, 4], kde=False, color='green')
+
+            ax[i, 5].set_title(k + ' Number of Comments Predicted Scores')
+            ax[i, 5].set_xlabel('Number of Comments')
+            ax[i, 5].set_ylabel('Frequency')
+            sns.distplot(self.num_comments_metrics[k]['pred'], ax=ax[i, 5], kde=False, color='red')
+        self.residualplot = FigureCanvasTkAgg(fig, self.tabview.tab('Residuals'))
+        self.residualplot.figure.tight_layout()
+        self.residualplot.get_tk_widget().pack(side=tkinter.TOP, fill=tkinter.BOTH, expand=1)
+
+        self.tabview.pack(fill='both', expand=True)
diff --git a/src/windows/modeltrainer.py b/src/windows/modeltrainer.py
index 6664b57..17908da 100644
--- a/src/windows/modeltrainer.py
+++ b/src/windows/modeltrainer.py
@@ -12,6 +12,8 @@ from scipy.sparse import hstack
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.preprocessing import LabelBinarizer
 import ssl
+import json
+from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
 
 try:
     _create_unverified_https_context = ssl._create_unverified_context
@@ -123,6 +125,8 @@ class ModelTrainer(customtkinter.CTkToplevel):
             "RandomForestRegressor": RandomForestRegressor(n_jobs=-1, n_estimators=70, min_samples_leaf=10, random_state = 10),
             "GradientBoostingRegressor": GradientBoostingRegressor(n_estimators=70, max_depth=5)
         }
+        self.ups_dict = {}
+        self.num_comments_dict = {}
 
         self.start()
 
@@ -207,15 +211,50 @@ class ModelTrainer(customtkinter.CTkToplevel):
             self.edit_textbox('Training {} for Upvotes'.format(model_name), line_count, 'wait')
             model.fit(self.X_train_ups, self.y_train_ups)
             self.save_model(model, model_name + '_ups')
+
+            ups_y_pred = model.predict(self.X_test_ups)
+            ups_mse = mean_squared_error(self.y_test_ups, ups_y_pred)
+            ups_mae = mean_absolute_error(self.y_test_ups, ups_y_pred)
+            ups_r2 = r2_score(self.y_test_ups, ups_y_pred)
+            self.ups_dict[model_name] = {
+                'mse': ups_mse,
+                'mae': ups_mae,
+                'r2': ups_r2,
+                'rmse': np.sqrt(ups_mse),
+                'pred': list(ups_y_pred),
+                'actual': list(self.y_test_ups)
+            }
+
             self.edit_textbox('Training {} for Upvotes'.format(model_name), line_count, 'done')
             line_count += 1
 
             self.edit_textbox('Training {} for Number of Comments'.format(model_name), line_count, 'wait')
             model.fit(self.X_train_num_comments, self.y_train_num_comments)
             self.save_model(model, model_name + '_num_comments')
+
+            num_comments_y_pred = model.predict(self.X_test_num_comments)
+            num_comments_mse = mean_squared_error(self.y_test_num_comments, num_comments_y_pred)
+            num_comments_mae = mean_absolute_error(self.y_test_num_comments, num_comments_y_pred)
+            num_comments_r2 = r2_score(self.y_test_num_comments, num_comments_y_pred)
+            self.num_comments_dict[model_name] = {
+                'mse': num_comments_mse,
+                'mae': num_comments_mae,
+                'r2': num_comments_r2,
+                'rmse': np.sqrt(num_comments_mse),
+                'pred': list(num_comments_y_pred),
+                'actual': list(self.y_test_num_comments)
+            }
+
             self.edit_textbox('Training {} for Number of Comments'.format(model_name), line_count, 'done')
             line_count += 1
 
+        # save the metrics
+        with open(self.model_dir + 'ups_metrics.json', 'w') as f:
+            json.dump(self.ups_dict, f)
+
+        with open(self.model_dir + 'num_comments_metrics.json', 'w') as f:
+            json.dump(self.num_comments_dict, f)
+
         self.edit_textbox('Training Complete. Models saved to models/ directory. You may now close this window.', line_count, 'done')
 
         # allow user to close window
diff --git a/src/windows/plotviewer.py b/src/windows/plotviewer.py
index 40af100..da334b6 100644
--- a/src/windows/plotviewer.py
+++ b/src/windows/plotviewer.py
@@ -15,6 +15,7 @@ from matplotlib.figure import Figure
 from helpers.subreddits import SUBREDDITS
 from .modeltrainer import ModelTrainer
 from .predict import Predict
+from .modelplots import ModelPlots
 
 import numpy as np
 
@@ -257,32 +258,46 @@ class PlotViewer(customtkinter.CTk):
             'RandomForestRegressor',
             'GradientBoostingRegressor',
         ]
-        self.model = customtkinter.CTkComboBox(self.tabview.tab("View Data / Predictions"), values=models)
+        self.model = customtkinter.CTkOptionMenu(self.tabview.tab("View Data / Predictions"), values=models)
         self.model.pack(pady=10, padx=10, side=tkinter.LEFT)
+        self.model.set('DummyRegressor')
 
         # metrics buttons
-        self.ups_metrics_button = customtkinter.CTkButton(self.tabview.tab("View Data / Predictions"), text="Ups Metrics")
+        self.ups_metrics_button = customtkinter.CTkButton(self.tabview.tab("View Data / Predictions"), text="Ups Metrics", command=self.ups_metrics)
         self.ups_metrics_button.pack(pady=10, padx=10, side=tkinter.LEFT)
 
-        self.num_comments_metrics_button = customtkinter.CTkButton(self.tabview.tab("View Data / Predictions"), text="Num Comments Metrics")
+        self.num_comments_metrics_button = customtkinter.CTkButton(self.tabview.tab("View Data / Predictions"), text="Num Comments Metrics", command=self.num_comments_metrics)
         self.num_comments_metrics_button.pack(pady=10, padx=10, side=tkinter.LEFT)
 
         # button for model plots
-        self.model_plots_button = customtkinter.CTkButton(self.tabview.tab("View Data / Predictions"), text="Model Plots")
+        self.model_plots_button = customtkinter.CTkButton(self.tabview.tab("View Data / Predictions"), text="Model Plots", command=self.show_model_plots)
         self.model_plots_button.pack(pady=10, padx=10, side=tkinter.RIGHT)
 
         # button for predicting
         self.predict_button = customtkinter.CTkButton(self.tabview.tab("View Data / Predictions"), text="Predict a new post", command=self.predict)
         self.predict_button.pack(pady=10, padx=10, side=tkinter.RIGHT)
 
+    def ups_metrics(self):
+        import json
+        selected_model = self.model.get()
+        with open(f'models/ups_metrics.json', 'r') as f:
+            metrics = json.load(f)
+        metrics = metrics[selected_model]
+        tkinter.messagebox.showinfo(f"Ups Metrics for {selected_model}", f"Mean Absolute Error: {metrics['mae']}" + "\n\n" + f"Mean Squared Error: {metrics['mse']}" + "\n\n" + f"Root Mean Squared Error: {metrics['rmse']}" + "\n\n" + f"R2 Score: {metrics['r2']}")
+
+    def num_comments_metrics(self):
+        import json
+        selected_model = self.model.get()
+        with open(f'models/num_comments_metrics.json', 'r') as f:
+            metrics = json.load(f)
+        metrics = metrics[selected_model]
+        tkinter.messagebox.showinfo(f"Num Comments Metrics for {selected_model}", f"Mean Absolute Error: {metrics['mae']}" + "\n\n" + f"Mean Squared Error: {metrics['mse']}" + "\n\n" + f"Root Mean Squared Error: {metrics['rmse']}" + "\n\n" + f"R2 Score: {metrics['r2']}")
 
     def predict(self):
         # child window to take input of the post - title, selftext, subreddit, day, hour, distinguished
         pred_win = Predict(self)
         self.wait_window(pred_win)
 
-
-
     def train_models(self):
         # open model training child window
         mt = ModelTrainer(self, self.posts)
@@ -294,4 +309,8 @@ class PlotViewer(customtkinter.CTk):
         self.models_button.destroy()
         self.show_model_options()
 
-
+    def show_model_plots(self):
+        pw = ModelPlots(self)
+        pw.grab_set()
+        pw.focus_set()
+        self.wait_window(pw)
-- 
cgit v1.2.3