Merge pull request #14 from luciferreeves/main

Fixed Text Prediction
author: Bobby <[email protected]> 2022-12-03 22:55:09 -0500
committer: GitHub <[email protected]> 2022-12-03 22:55:09 -0500
commit: d2550bdbbdd36550380ea07d8335ade7802d0f29 (patch)
tree: 02cc010a5094f26c6ef06e81b5f38cc22ab34c7f /src/windows
parent: ecf0f94dad8147c4fe90622fbccbc2df355036be (diff)
parent: 432685c4972870a8119996335f53c08838661b80 (diff)
download: RedditEngagementPrediction-d2550bdbbdd36550380ea07d8335ade7802d0f29.tar.xz
RedditEngagementPrediction-d2550bdbbdd36550380ea07d8335ade7802d0f29.zip
3 files changed, 55 insertions, 57 deletions
diff --git a/src/windows/modeltrainer.py b/src/windows/modeltrainer.py
index 17908da..2b7bc10 100644
--- a/src/windows/modeltrainer.py
+++ b/src/windows/modeltrainer.py
@@ -8,9 +8,7 @@ import customtkinter
 import nltk
 from nltk.corpus import stopwords
 from nltk.stem import SnowballStemmer
-from scipy.sparse import hstack
 from sklearn.feature_extraction.text import TfidfVectorizer
-from sklearn.preprocessing import LabelBinarizer
 import ssl
 import json
 from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
@@ -26,6 +24,7 @@ warnings.filterwarnings('ignore')
 nltk.download('stopwords')
 nltk.download('wordnet')
 from string import punctuation
+import pandas as pd
 
 import numpy as np
 from sklearn.dummy import DummyRegressor
@@ -41,7 +40,6 @@ def preprocess(message):
     stuff_to_be_removed = list(stopwords.words('english'))+list(punctuation)
 
     # Convert message to lower case 
-    message = str(message)
     message = message.lower()
     
     # Remove all the links from the messages 
@@ -102,7 +100,15 @@ class ModelTrainer(customtkinter.CTkToplevel):
         
         self.categorical_features = ['subreddit', 'distinguished', 'hour', 'day']
         self.posts['text'] = self.posts['title'] + ' ' + self.posts['selftext']
+        self.posts['text'] = self.posts['text'].astype(str)
         self.posts['text'] = self.posts['text'].apply(lambda x: preprocess(x))
+        self.posts.drop(['title', 'selftext'], axis=1, inplace=True)
+
+        # convert categorical features
+        for col in self.categorical_features:
+            self.posts[col] = self.posts[col].astype('category')
+
+        self.posts = pd.get_dummies(self.posts, columns=self.categorical_features)
 
         self.text_features = ['text']
         self.title('Reddit Data Analysis - Building Models')
@@ -154,41 +160,25 @@ class ModelTrainer(customtkinter.CTkToplevel):
 
 
     def start(self):
-        self.ups = self.posts['ups']
-        self.num_comments = self.posts['num_comments']
-
-        # select only text, subreddit, link_flair_text, distinguished, hour, day, ups, num_comments
-        self.posts_ups = self.posts[self.categorical_features + self.text_features + ['ups']]
-        self.posts_num_comments = self.posts[self.categorical_features + self.text_features + ['num_comments']]
         self.tfidf = TfidfVectorizer()
-        self.label_binarizer = LabelBinarizer()
+        self.X = self.tfidf.fit_transform(self.posts['text'])
 
         self.edit_textbox('Preparing Data (Upvotes)', 1, 'wait')
-        
-        # generate tfidf - label_binarizer for ups
-        self.tfidf_ups = self.tfidf.fit_transform(self.posts_ups['text'])
-        self.category_ups = [self.label_binarizer.fit_transform(self.posts_ups[col]) for col in self.categorical_features]
-        self.category_ups = np.concatenate(self.category_ups, axis=1)
-        self.X_ups = hstack([self.tfidf_ups, self.category_ups])
-        self.y_ups = self.posts_ups['ups']
+        # dataframes for ups
+        self.ups_df = self.posts.drop(['num_comments'], axis=1)
 
-        # split data into train and test sets
-        self.X_train_ups, self.X_test_ups, self.y_train_ups, self.y_test_ups = train_test_split(self.X_ups, self.y_ups, test_size=0.2, random_state=42)
+        # split data into train and test sets for ups
+        self.X_train_ups, self.X_test_ups, self.y_train_ups, self.y_test_ups = train_test_split(self.X, self.ups_df['ups'], test_size=0.2, random_state=10)
 
         self.edit_textbox('Preparing Data (Upvotes)', 1, 'done')
 
         self.edit_textbox('Preparing Data (Number of Comments)', 3, 'wait')
+        # dataframes for num_comments
+        self.num_comments_df = self.posts.drop(['ups'], axis=1)
 
-        # generate tfidf - label_binarizer for num_comments
-        self.tfidf_num_comments = self.tfidf.fit_transform(self.posts_num_comments['text'])
-        self.category_num_comments = [self.label_binarizer.fit_transform(self.posts_num_comments[col]) for col in self.categorical_features]
-        self.category_num_comments = np.concatenate(self.category_num_comments, axis=1)
-        self.X_num_comments = hstack([self.tfidf_num_comments, self.category_num_comments])
-        self.y_num_comments = self.posts_num_comments['num_comments']
-
-        # split data into train and test sets
-        self.X_train_num_comments, self.X_test_num_comments, self.y_train_num_comments, self.y_test_num_comments = train_test_split(self.X_num_comments, self.y_num_comments, test_size=0.2, random_state=42)
-
+        # split data into train and test sets for num_comments
+        self.X_train_num_comments, self.X_test_num_comments, self.y_train_num_comments, self.y_test_num_comments = train_test_split(self.X, self.num_comments_df['num_comments'], test_size=0.2, random_state=10)
+        
         self.edit_textbox('Preparing Data (Number of Comments)', 2, 'done')
 
         # train models
@@ -248,6 +238,10 @@ class ModelTrainer(customtkinter.CTkToplevel):
             self.edit_textbox('Training {} for Number of Comments'.format(model_name), line_count, 'done')
             line_count += 1
 
+        # dump the vectorizer
+        with open(self.model_dir + 'vectorizer.pkl', 'wb') as f:
+            pickle.dump(self.tfidf, f)
+
         # save the metrics
         with open(self.model_dir + 'ups_metrics.json', 'w') as f:
             json.dump(self.ups_dict, f)
diff --git a/src/windows/plotviewer.py b/src/windows/plotviewer.py
index da334b6..635befa 100644
--- a/src/windows/plotviewer.py
+++ b/src/windows/plotviewer.py
@@ -295,7 +295,7 @@ class PlotViewer(customtkinter.CTk):
 
     def predict(self):
         # child window to take input of the post - title, selftext, subreddit, day, hour, distinguished
-        pred_win = Predict(self)
+        pred_win = Predict(self, self.model.get())
         self.wait_window(pred_win)
         
     def train_models(self):
diff --git a/src/windows/predict.py b/src/windows/predict.py
index 374ee09..49402ff 100644
--- a/src/windows/predict.py
+++ b/src/windows/predict.py
@@ -1,24 +1,21 @@
-import random
-import math
 import customtkinter
 import tkinter
 import pandas as pd
 import os 
 import pickle
-from scipy.sparse import hstack
-from sklearn.feature_extraction.text import TfidfVectorizer
-from sklearn.preprocessing import LabelBinarizer
 from .modeltrainer import preprocess
 import tkinter.messagebox
+import joblib
 
 class Predict(customtkinter.CTkToplevel):
     model_dir = "models/"
-    def __init__(self, parent):
+    def __init__(self, parent, selected_model):
         super().__init__(parent)
         self.parent = parent
         self.title("Predictions")
         self.grab_set()
         self.focus_set()
+        self.selected_model = selected_model
         posx = int(self.winfo_screenwidth()/2 - 150)
         posy = int(self.winfo_screenheight()/2 - 350)
         self.geometry("300x650+{}+{}".format(posx, posy))
@@ -29,6 +26,9 @@ class Predict(customtkinter.CTkToplevel):
         self.title_label.pack(pady=5, padx=30, fill='x', side=tkinter.TOP, anchor='w')
         self.title_entry = customtkinter.CTkEntry(self, width=240)
         self.title_entry.pack(pady=10, padx=10)
+
+        # uncomment the following lines to add title
+
         self.title_entry.insert(0, 'A good openCV tutorial?')
 
         # selftext
@@ -36,6 +36,9 @@ class Predict(customtkinter.CTkToplevel):
         self.selftext_label.pack(pady=5, padx=30, fill='x', side=tkinter.TOP, anchor='w')
         self.selftext_entry = customtkinter.CTkTextbox(self, width=240, height=100)
         self.selftext_entry.pack(pady=10, padx=10)
+
+        # uncomment the following lines to add selftext
+
         self.selftext_entry.insert("0.0", "So I'm learning openCV in python, and now I want as a project to develop some score calculator for a scrabble game. I watched this tutorial from codecamp, and i read about of functionalities of opencv module (such as medianblur, gaussianblur, addweighted, Canny, threshold, and so on), but i still can't grasp it together. Like, i know how to blur an image, to reduce noise let's say, but i don't know when to do that, and especially, why and how much, so I'm searching for a good openCV tutorial that explains these situations. \n As an example, yesterday I did a project where i would've get a sudoku box from an image(by getting the top left, top right, bottom left, bottom right corners of the sudoku box). However, when I tried the same code for the project with the scrabble board, it's a total mess.")
 
         # subreddit
@@ -82,39 +85,40 @@ class Predict(customtkinter.CTkToplevel):
         day = self.day_entry.get()
         hour = self.hour_entry.get()
         hour = int(hour.split(':')[0])
-        distinguished = self.distinguished_entry.get()
+        distinguished = False if self.distinguished_entry.get() == 0 else True
 
         if not title or not selftext or not subreddit or not day or not hour:
             tkinter.messagebox.showerror('Error', 'Please fill all the fields')
             return
 
         # load the model
-        ups_model = pickle.load(open(os.path.join(self.model_dir, "DummyRegressor_ups.pkl"), 'rb'))
-        num_comments_model = pickle.load(open(os.path.join(self.model_dir, "DummyRegressor_num_comments.pkl"), 'rb'))
+        ups_model = pickle.load(open(os.path.join(self.model_dir, self.selected_model + '_ups.pkl'), 'rb'))
+        num_comments_model = pickle.load(open(os.path.join(self.model_dir, self.selected_model + '_num_comments.pkl'), 'rb'))
+
+        # load the vectorizer
+        vectorizer = joblib.load(os.path.join(self.model_dir, "vectorizer.pkl"))
 
         text = title + " " + selftext
         text = preprocess(text)
 
-        post = pd.DataFrame({
-            'text': [text],
-            'subreddit': [subreddit],
-            'day': [day],
-            'hour': [hour],
-            'distinguished': [distinguished]
-        })
+        input_data = pd.DataFrame(columns=['text', 'day', 'hour', 'subreddit', 'distinguished'])
+        input_data = input_data.append({'text': text, 'day': day, 'hour': hour, 'subreddit': subreddit, 'distinguished': distinguished}, ignore_index=True)
+
+        cat_cols = ['day', 'hour', 'subreddit', 'distinguished']
+        for col in cat_cols:
+            input_data[col] = input_data[col].astype('category')
+        input_data = pd.get_dummies(input_data, columns=cat_cols)
+        input_data['text'] = input_data['text'].astype(str)
 
-        self.tfidf_vectorizer = TfidfVectorizer()
-        self.label_binarizer = LabelBinarizer()
+        X = vectorizer.transform(input_data['text'])
 
-        post_cat = [self.label_binarizer.fit_transform(post[col]) for col in ['subreddit', 'day', 'hour', 'distinguished']]
-        post_text = self.tfidf_vectorizer.fit_transform(post['text'])
-        postX = hstack([post_text] + post_cat).tocsr()
+        # predict the ups
+        ups = ups_model.predict(X)
+        ups = int(ups[0])
 
-        ups = int(ups_model.predict(postX)[0])
-        num_comments = int(num_comments_model.predict(postX)[0])
+        # predict the num_comments
+        num_comments = num_comments_model.predict(X)
+        num_comments = int(num_comments[0])
 
-        # random bias from the prediction
-        ups = int(ups // math.log(ups + 1))
-        num_comments = int(num_comments // math.log(num_comments + 1))
+        tkinter.messagebox.showinfo('Result', 'Predicted ups: {}\nPredicted num_comments: {}'.format(ups, num_comments))
 
-        tkinter.messagebox.showinfo('Predictions', 'Predicted ups: {}\nPredicted num_comments: {}'.format(ups, num_comments))
author	Bobby <[email protected]>	2022-12-03 22:55:09 -0500
committer	GitHub <[email protected]>	2022-12-03 22:55:09 -0500
commit	d2550bdbbdd36550380ea07d8335ade7802d0f29 (patch)
tree	02cc010a5094f26c6ef06e81b5f38cc22ab34c7f /src/windows
parent	ecf0f94dad8147c4fe90622fbccbc2df355036be (diff)
parent	432685c4972870a8119996335f53c08838661b80 (diff)
download	RedditEngagementPrediction-d2550bdbbdd36550380ea07d8335ade7802d0f29.tar.xz RedditEngagementPrediction-d2550bdbbdd36550380ea07d8335ade7802d0f29.zip