diff options
| author | Bobby <[email protected]> | 2022-12-03 22:55:09 -0500 |
|---|---|---|
| committer | GitHub <[email protected]> | 2022-12-03 22:55:09 -0500 |
| commit | d2550bdbbdd36550380ea07d8335ade7802d0f29 (patch) | |
| tree | 02cc010a5094f26c6ef06e81b5f38cc22ab34c7f /src/windows | |
| parent | ecf0f94dad8147c4fe90622fbccbc2df355036be (diff) | |
| parent | 432685c4972870a8119996335f53c08838661b80 (diff) | |
| download | RedditEngagementPrediction-d2550bdbbdd36550380ea07d8335ade7802d0f29.tar.xz RedditEngagementPrediction-d2550bdbbdd36550380ea07d8335ade7802d0f29.zip | |
Merge pull request #14 from luciferreeves/main
Fixed Text Prediction
Diffstat (limited to 'src/windows')
| -rw-r--r-- | src/windows/modeltrainer.py | 52 | ||||
| -rw-r--r-- | src/windows/plotviewer.py | 2 | ||||
| -rw-r--r-- | src/windows/predict.py | 58 |
3 files changed, 55 insertions, 57 deletions
diff --git a/src/windows/modeltrainer.py b/src/windows/modeltrainer.py index 17908da..2b7bc10 100644 --- a/src/windows/modeltrainer.py +++ b/src/windows/modeltrainer.py @@ -8,9 +8,7 @@ import customtkinter import nltk from nltk.corpus import stopwords from nltk.stem import SnowballStemmer -from scipy.sparse import hstack from sklearn.feature_extraction.text import TfidfVectorizer -from sklearn.preprocessing import LabelBinarizer import ssl import json from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error @@ -26,6 +24,7 @@ warnings.filterwarnings('ignore') nltk.download('stopwords') nltk.download('wordnet') from string import punctuation +import pandas as pd import numpy as np from sklearn.dummy import DummyRegressor @@ -41,7 +40,6 @@ def preprocess(message): stuff_to_be_removed = list(stopwords.words('english'))+list(punctuation) # Convert message to lower case - message = str(message) message = message.lower() # Remove all the links from the messages @@ -102,7 +100,15 @@ class ModelTrainer(customtkinter.CTkToplevel): self.categorical_features = ['subreddit', 'distinguished', 'hour', 'day'] self.posts['text'] = self.posts['title'] + ' ' + self.posts['selftext'] + self.posts['text'] = self.posts['text'].astype(str) self.posts['text'] = self.posts['text'].apply(lambda x: preprocess(x)) + self.posts.drop(['title', 'selftext'], axis=1, inplace=True) + + # convert categorical features + for col in self.categorical_features: + self.posts[col] = self.posts[col].astype('category') + + self.posts = pd.get_dummies(self.posts, columns=self.categorical_features) self.text_features = ['text'] self.title('Reddit Data Analysis - Building Models') @@ -154,41 +160,25 @@ class ModelTrainer(customtkinter.CTkToplevel): def start(self): - self.ups = self.posts['ups'] - self.num_comments = self.posts['num_comments'] - - # select only text, subreddit, link_flair_text, distinguished, hour, day, ups, num_comments - self.posts_ups = self.posts[self.categorical_features + self.text_features + ['ups']] - self.posts_num_comments = self.posts[self.categorical_features + self.text_features + ['num_comments']] self.tfidf = TfidfVectorizer() - self.label_binarizer = LabelBinarizer() + self.X = self.tfidf.fit_transform(self.posts['text']) self.edit_textbox('Preparing Data (Upvotes)', 1, 'wait') - - # generate tfidf - label_binarizer for ups - self.tfidf_ups = self.tfidf.fit_transform(self.posts_ups['text']) - self.category_ups = [self.label_binarizer.fit_transform(self.posts_ups[col]) for col in self.categorical_features] - self.category_ups = np.concatenate(self.category_ups, axis=1) - self.X_ups = hstack([self.tfidf_ups, self.category_ups]) - self.y_ups = self.posts_ups['ups'] + # dataframes for ups + self.ups_df = self.posts.drop(['num_comments'], axis=1) - # split data into train and test sets - self.X_train_ups, self.X_test_ups, self.y_train_ups, self.y_test_ups = train_test_split(self.X_ups, self.y_ups, test_size=0.2, random_state=42) + # split data into train and test sets for ups + self.X_train_ups, self.X_test_ups, self.y_train_ups, self.y_test_ups = train_test_split(self.X, self.ups_df['ups'], test_size=0.2, random_state=10) self.edit_textbox('Preparing Data (Upvotes)', 1, 'done') self.edit_textbox('Preparing Data (Number of Comments)', 3, 'wait') + # dataframes for num_comments + self.num_comments_df = self.posts.drop(['ups'], axis=1) - # generate tfidf - label_binarizer for num_comments - self.tfidf_num_comments = self.tfidf.fit_transform(self.posts_num_comments['text']) - self.category_num_comments = [self.label_binarizer.fit_transform(self.posts_num_comments[col]) for col in self.categorical_features] - self.category_num_comments = np.concatenate(self.category_num_comments, axis=1) - self.X_num_comments = hstack([self.tfidf_num_comments, self.category_num_comments]) - self.y_num_comments = self.posts_num_comments['num_comments'] - - # split data into train and test sets - self.X_train_num_comments, self.X_test_num_comments, self.y_train_num_comments, self.y_test_num_comments = train_test_split(self.X_num_comments, self.y_num_comments, test_size=0.2, random_state=42) - + # split data into train and test sets for num_comments + self.X_train_num_comments, self.X_test_num_comments, self.y_train_num_comments, self.y_test_num_comments = train_test_split(self.X, self.num_comments_df['num_comments'], test_size=0.2, random_state=10) + self.edit_textbox('Preparing Data (Number of Comments)', 2, 'done') # train models @@ -248,6 +238,10 @@ class ModelTrainer(customtkinter.CTkToplevel): self.edit_textbox('Training {} for Number of Comments'.format(model_name), line_count, 'done') line_count += 1 + # dump the vectorizer + with open(self.model_dir + 'vectorizer.pkl', 'wb') as f: + pickle.dump(self.tfidf, f) + # save the metrics with open(self.model_dir + 'ups_metrics.json', 'w') as f: json.dump(self.ups_dict, f) diff --git a/src/windows/plotviewer.py b/src/windows/plotviewer.py index da334b6..635befa 100644 --- a/src/windows/plotviewer.py +++ b/src/windows/plotviewer.py @@ -295,7 +295,7 @@ class PlotViewer(customtkinter.CTk): def predict(self): # child window to take input of the post - title, selftext, subreddit, day, hour, distinguished - pred_win = Predict(self) + pred_win = Predict(self, self.model.get()) self.wait_window(pred_win) def train_models(self): diff --git a/src/windows/predict.py b/src/windows/predict.py index 374ee09..49402ff 100644 --- a/src/windows/predict.py +++ b/src/windows/predict.py @@ -1,24 +1,21 @@ -import random -import math import customtkinter import tkinter import pandas as pd import os import pickle -from scipy.sparse import hstack -from sklearn.feature_extraction.text import TfidfVectorizer -from sklearn.preprocessing import LabelBinarizer from .modeltrainer import preprocess import tkinter.messagebox +import joblib class Predict(customtkinter.CTkToplevel): model_dir = "models/" - def __init__(self, parent): + def __init__(self, parent, selected_model): super().__init__(parent) self.parent = parent self.title("Predictions") self.grab_set() self.focus_set() + self.selected_model = selected_model posx = int(self.winfo_screenwidth()/2 - 150) posy = int(self.winfo_screenheight()/2 - 350) self.geometry("300x650+{}+{}".format(posx, posy)) @@ -29,6 +26,9 @@ class Predict(customtkinter.CTkToplevel): self.title_label.pack(pady=5, padx=30, fill='x', side=tkinter.TOP, anchor='w') self.title_entry = customtkinter.CTkEntry(self, width=240) self.title_entry.pack(pady=10, padx=10) + + # uncomment the following lines to add title + self.title_entry.insert(0, 'A good openCV tutorial?') # selftext @@ -36,6 +36,9 @@ class Predict(customtkinter.CTkToplevel): self.selftext_label.pack(pady=5, padx=30, fill='x', side=tkinter.TOP, anchor='w') self.selftext_entry = customtkinter.CTkTextbox(self, width=240, height=100) self.selftext_entry.pack(pady=10, padx=10) + + # uncomment the following lines to add selftext + self.selftext_entry.insert("0.0", "So I'm learning openCV in python, and now I want as a project to develop some score calculator for a scrabble game. I watched this tutorial from codecamp, and i read about of functionalities of opencv module (such as medianblur, gaussianblur, addweighted, Canny, threshold, and so on), but i still can't grasp it together. Like, i know how to blur an image, to reduce noise let's say, but i don't know when to do that, and especially, why and how much, so I'm searching for a good openCV tutorial that explains these situations. \n As an example, yesterday I did a project where i would've get a sudoku box from an image(by getting the top left, top right, bottom left, bottom right corners of the sudoku box). However, when I tried the same code for the project with the scrabble board, it's a total mess.") # subreddit @@ -82,39 +85,40 @@ class Predict(customtkinter.CTkToplevel): day = self.day_entry.get() hour = self.hour_entry.get() hour = int(hour.split(':')[0]) - distinguished = self.distinguished_entry.get() + distinguished = False if self.distinguished_entry.get() == 0 else True if not title or not selftext or not subreddit or not day or not hour: tkinter.messagebox.showerror('Error', 'Please fill all the fields') return # load the model - ups_model = pickle.load(open(os.path.join(self.model_dir, "DummyRegressor_ups.pkl"), 'rb')) - num_comments_model = pickle.load(open(os.path.join(self.model_dir, "DummyRegressor_num_comments.pkl"), 'rb')) + ups_model = pickle.load(open(os.path.join(self.model_dir, self.selected_model + '_ups.pkl'), 'rb')) + num_comments_model = pickle.load(open(os.path.join(self.model_dir, self.selected_model + '_num_comments.pkl'), 'rb')) + + # load the vectorizer + vectorizer = joblib.load(os.path.join(self.model_dir, "vectorizer.pkl")) text = title + " " + selftext text = preprocess(text) - post = pd.DataFrame({ - 'text': [text], - 'subreddit': [subreddit], - 'day': [day], - 'hour': [hour], - 'distinguished': [distinguished] - }) + input_data = pd.DataFrame(columns=['text', 'day', 'hour', 'subreddit', 'distinguished']) + input_data = input_data.append({'text': text, 'day': day, 'hour': hour, 'subreddit': subreddit, 'distinguished': distinguished}, ignore_index=True) + + cat_cols = ['day', 'hour', 'subreddit', 'distinguished'] + for col in cat_cols: + input_data[col] = input_data[col].astype('category') + input_data = pd.get_dummies(input_data, columns=cat_cols) + input_data['text'] = input_data['text'].astype(str) - self.tfidf_vectorizer = TfidfVectorizer() - self.label_binarizer = LabelBinarizer() + X = vectorizer.transform(input_data['text']) - post_cat = [self.label_binarizer.fit_transform(post[col]) for col in ['subreddit', 'day', 'hour', 'distinguished']] - post_text = self.tfidf_vectorizer.fit_transform(post['text']) - postX = hstack([post_text] + post_cat).tocsr() + # predict the ups + ups = ups_model.predict(X) + ups = int(ups[0]) - ups = int(ups_model.predict(postX)[0]) - num_comments = int(num_comments_model.predict(postX)[0]) + # predict the num_comments + num_comments = num_comments_model.predict(X) + num_comments = int(num_comments[0]) - # random bias from the prediction - ups = int(ups // math.log(ups + 1)) - num_comments = int(num_comments // math.log(num_comments + 1)) + tkinter.messagebox.showinfo('Result', 'Predicted ups: {}\nPredicted num_comments: {}'.format(ups, num_comments)) - tkinter.messagebox.showinfo('Predictions', 'Predicted ups: {}\nPredicted num_comments: {}'.format(ups, num_comments)) |
