aboutsummaryrefslogtreecommitdiff
path: root/src/windows/predict.py
diff options
context:
space:
mode:
authorBobby <[email protected]>2022-12-03 22:55:09 -0500
committerGitHub <[email protected]>2022-12-03 22:55:09 -0500
commitd2550bdbbdd36550380ea07d8335ade7802d0f29 (patch)
tree02cc010a5094f26c6ef06e81b5f38cc22ab34c7f /src/windows/predict.py
parentecf0f94dad8147c4fe90622fbccbc2df355036be (diff)
parent432685c4972870a8119996335f53c08838661b80 (diff)
downloadRedditEngagementPrediction-d2550bdbbdd36550380ea07d8335ade7802d0f29.tar.xz
RedditEngagementPrediction-d2550bdbbdd36550380ea07d8335ade7802d0f29.zip
Merge pull request #14 from luciferreeves/main
Fixed Text Prediction
Diffstat (limited to 'src/windows/predict.py')
-rw-r--r--src/windows/predict.py58
1 files changed, 31 insertions, 27 deletions
diff --git a/src/windows/predict.py b/src/windows/predict.py
index 374ee09..49402ff 100644
--- a/src/windows/predict.py
+++ b/src/windows/predict.py
@@ -1,24 +1,21 @@
-import random
-import math
import customtkinter
import tkinter
import pandas as pd
import os
import pickle
-from scipy.sparse import hstack
-from sklearn.feature_extraction.text import TfidfVectorizer
-from sklearn.preprocessing import LabelBinarizer
from .modeltrainer import preprocess
import tkinter.messagebox
+import joblib
class Predict(customtkinter.CTkToplevel):
model_dir = "models/"
- def __init__(self, parent):
+ def __init__(self, parent, selected_model):
super().__init__(parent)
self.parent = parent
self.title("Predictions")
self.grab_set()
self.focus_set()
+ self.selected_model = selected_model
posx = int(self.winfo_screenwidth()/2 - 150)
posy = int(self.winfo_screenheight()/2 - 350)
self.geometry("300x650+{}+{}".format(posx, posy))
@@ -29,6 +26,9 @@ class Predict(customtkinter.CTkToplevel):
self.title_label.pack(pady=5, padx=30, fill='x', side=tkinter.TOP, anchor='w')
self.title_entry = customtkinter.CTkEntry(self, width=240)
self.title_entry.pack(pady=10, padx=10)
+
+ # uncomment the following lines to add title
+
self.title_entry.insert(0, 'A good openCV tutorial?')
# selftext
@@ -36,6 +36,9 @@ class Predict(customtkinter.CTkToplevel):
self.selftext_label.pack(pady=5, padx=30, fill='x', side=tkinter.TOP, anchor='w')
self.selftext_entry = customtkinter.CTkTextbox(self, width=240, height=100)
self.selftext_entry.pack(pady=10, padx=10)
+
+ # uncomment the following lines to add selftext
+
self.selftext_entry.insert("0.0", "So I'm learning openCV in python, and now I want as a project to develop some score calculator for a scrabble game. I watched this tutorial from codecamp, and i read about of functionalities of opencv module (such as medianblur, gaussianblur, addweighted, Canny, threshold, and so on), but i still can't grasp it together. Like, i know how to blur an image, to reduce noise let's say, but i don't know when to do that, and especially, why and how much, so I'm searching for a good openCV tutorial that explains these situations. \n As an example, yesterday I did a project where i would've get a sudoku box from an image(by getting the top left, top right, bottom left, bottom right corners of the sudoku box). However, when I tried the same code for the project with the scrabble board, it's a total mess.")
# subreddit
@@ -82,39 +85,40 @@ class Predict(customtkinter.CTkToplevel):
day = self.day_entry.get()
hour = self.hour_entry.get()
hour = int(hour.split(':')[0])
- distinguished = self.distinguished_entry.get()
+ distinguished = False if self.distinguished_entry.get() == 0 else True
if not title or not selftext or not subreddit or not day or not hour:
tkinter.messagebox.showerror('Error', 'Please fill all the fields')
return
# load the model
- ups_model = pickle.load(open(os.path.join(self.model_dir, "DummyRegressor_ups.pkl"), 'rb'))
- num_comments_model = pickle.load(open(os.path.join(self.model_dir, "DummyRegressor_num_comments.pkl"), 'rb'))
+ ups_model = pickle.load(open(os.path.join(self.model_dir, self.selected_model + '_ups.pkl'), 'rb'))
+ num_comments_model = pickle.load(open(os.path.join(self.model_dir, self.selected_model + '_num_comments.pkl'), 'rb'))
+
+ # load the vectorizer
+ vectorizer = joblib.load(os.path.join(self.model_dir, "vectorizer.pkl"))
text = title + " " + selftext
text = preprocess(text)
- post = pd.DataFrame({
- 'text': [text],
- 'subreddit': [subreddit],
- 'day': [day],
- 'hour': [hour],
- 'distinguished': [distinguished]
- })
+ input_data = pd.DataFrame(columns=['text', 'day', 'hour', 'subreddit', 'distinguished'])
+ input_data = input_data.append({'text': text, 'day': day, 'hour': hour, 'subreddit': subreddit, 'distinguished': distinguished}, ignore_index=True)
+
+ cat_cols = ['day', 'hour', 'subreddit', 'distinguished']
+ for col in cat_cols:
+ input_data[col] = input_data[col].astype('category')
+ input_data = pd.get_dummies(input_data, columns=cat_cols)
+ input_data['text'] = input_data['text'].astype(str)
- self.tfidf_vectorizer = TfidfVectorizer()
- self.label_binarizer = LabelBinarizer()
+ X = vectorizer.transform(input_data['text'])
- post_cat = [self.label_binarizer.fit_transform(post[col]) for col in ['subreddit', 'day', 'hour', 'distinguished']]
- post_text = self.tfidf_vectorizer.fit_transform(post['text'])
- postX = hstack([post_text] + post_cat).tocsr()
+ # predict the ups
+ ups = ups_model.predict(X)
+ ups = int(ups[0])
- ups = int(ups_model.predict(postX)[0])
- num_comments = int(num_comments_model.predict(postX)[0])
+ # predict the num_comments
+ num_comments = num_comments_model.predict(X)
+ num_comments = int(num_comments[0])
- # random bias from the prediction
- ups = int(ups // math.log(ups + 1))
- num_comments = int(num_comments // math.log(num_comments + 1))
+ tkinter.messagebox.showinfo('Result', 'Predicted ups: {}\nPredicted num_comments: {}'.format(ups, num_comments))
- tkinter.messagebox.showinfo('Predictions', 'Predicted ups: {}\nPredicted num_comments: {}'.format(ups, num_comments))