"""Train and persist regression models that predict Reddit post engagement.

The module exposes :func:`preprocess`, which normalises raw post text, and
:class:`ModelTrainer`, a modal window that vectorises the text with TF-IDF,
trains several scikit-learn regressors for the ``ups`` and ``num_comments``
targets and writes the fitted models plus their metrics to ``models/``.
"""

import functools
import html
import json
import os
import pickle
import re
import ssl
import warnings
from string import punctuation

import customtkinter
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# NOTE(review): this replaces the process-wide default HTTPS context with an
# unverified one, i.e. certificate checks are disabled for every default
# HTTPS context created afterwards. Confirm this is still required.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

warnings.filterwarnings('ignore')
nltk.download('stopwords')
nltk.download('wordnet')

# Kept below the warnings filter so any import-time warnings stay suppressed,
# exactly as in the original statement order.
import numpy as np
import pandas as pd
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression, RidgeCV
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor

# Patterns are compiled once at import time instead of on every call.
_URL_PATTERN = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'
    r'(?:%[0-9a-fA-F][0-9a-fA-F]))+'
)
_MENTION_PATTERN = re.compile("(@[A-Za-z0-9_]+)")
# NOTE(review): the range U+24C2-U+1F251 is very wide and also matches CJK,
# Hangul and most other non-Latin scripts, not only emoji. Kept unchanged.
_EMOJI_PATTERN = re.compile(
    "["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    u"\U00002500-\U00002BEF"  # box drawing, shapes, misc symbols, arrows
    u"\U00002702-\U000027B0"
    u"\U00002702-\U000027B0"
    u"\U000024C2-\U0001F251"
    u"\U0001f926-\U0001f937"
    u"\U00010000-\U0010ffff"
    u"\u2640-\u2642"
    u"\u2600-\u2B55"
    u"\u200d"
    u"\u23cf"
    u"\u23e9"
    u"\u231a"
    u"\ufe0f"  # dingbats
    u"\u3030"
    "]+",
    flags=re.UNICODE,
)
_PUNCTUATION_TABLE = str.maketrans('', '', punctuation)


@functools.lru_cache(maxsize=1)
def _text_tools():
    """Return ``(stemmer, removable_tokens)``, built once and then cached.

    ``preprocess`` runs once per post, so rebuilding the stemmer and
    re-reading the stopword corpus on every call is wasted work. The tokens
    are held in a frozenset so membership checks are O(1).
    """
    stemmer = SnowballStemmer('english')
    removable = frozenset(stopwords.words('english')).union(punctuation)
    return stemmer, removable


def preprocess(message):
    """Normalise a raw post string for TF-IDF vectorisation.

    Steps, in order: lower-case, strip URLs, strip ``@mentions``, strip the
    characters matched by the emoji pattern, unescape HTML entities, trim
    surrounding whitespace, delete ASCII punctuation, drop English stopwords
    and stem the remaining words with the Snowball stemmer.

    Args:
        message: The text to clean.

    Returns:
        The cleaned text as space-separated stems.
    """
    stemmer, removable = _text_tools()

    message = message.lower()
    message = _URL_PATTERN.sub('', message)
    message = _MENTION_PATTERN.sub('', message)
    message = _EMOJI_PATTERN.sub('', message)
    # Turn HTML entities such as "&amp;" back into plain characters so the
    # punctuation pass below can remove them.
    message = html.unescape(message)
    message = message.strip()
    message = message.translate(_PUNCTUATION_TABLE)
    return ' '.join(
        stemmer.stem(word)
        for word in message.split()
        if word not in removable
    )


class ModelTrainer(customtkinter.CTkToplevel):
    """Top-level window that trains, evaluates and saves the regressors.

    Training starts from ``__init__`` and progress is written to a read-only
    textbox. The window cannot be closed until training has finished.
    """

    # Directory that receives the pickled models, vectorizer and metrics.
    model_dir = 'models/'

    def __init__(self, parent, posts):
        """Prepare the data, build the window and start training.

        Args:
            parent: The owning widget.
            posts: DataFrame holding at least the columns ``title``,
                ``selftext``, ``subreddit``, ``distinguished``, ``hour``,
                ``day``, ``ups`` and ``num_comments``. It is not modified.
        """
        super().__init__(parent)
        self.parent = parent
        self.features = ['title', 'selftext', 'subreddit', 'distinguished', 'hour', 'day']
        self.targets = ['ups', 'num_comments']
        self.categorical_features = ['subreddit', 'distinguished', 'hour', 'day']
        self.text_features = ['text']

        # Work on a copy restricted to the needed columns so the caller's
        # DataFrame is not mutated as a side effect of opening this window.
        wanted = set(self.features + self.targets)
        kept_columns = [col for col in posts.columns if col in wanted]
        self.posts = posts[kept_columns].copy()

        # Fill missing parts with '' first: concatenating with NaN yields NaN,
        # which previously became the literal text "nan" and lost the title.
        title = self.posts['title'].fillna('').astype(str)
        body = self.posts['selftext'].fillna('').astype(str)
        self.posts['text'] = (title + ' ' + body).apply(preprocess)
        self.posts.drop(['title', 'selftext'], axis=1, inplace=True)

        # One-hot encode the categorical features.
        for col in self.categorical_features:
            self.posts[col] = self.posts[col].astype('category')
        self.posts = pd.get_dummies(self.posts, columns=self.categorical_features)

        # Centre a fixed-size 600x300 window on the screen.
        self.title('Reddit Data Analysis - Building Models')
        posx = int(self.winfo_screenwidth() / 2 - 300)
        posy = int(self.winfo_screenheight() / 2 - 150)
        self.geometry('600x300+{}+{}'.format(posx, posy))
        self.resizable(False, False)
        # Ignore close requests until training has finished.
        self.protocol('WM_DELETE_WINDOW', self.disable_event)

        self.updates = customtkinter.CTkTextbox(self, height=300, width=600, state='disabled')
        self.updates.pack(fill='both', expand=True)

        # Estimators to train, keyed by the name used for files and metrics.
        self.model_hashmap = {
            "DummyRegressor": DummyRegressor(),
            "LinearRegression": LinearRegression(),
            "RidgeCV": RidgeCV(cv=10),
            "KNeighborsRegressor": KNeighborsRegressor(),
            "DecisionTreeRegressor": DecisionTreeRegressor(
                min_samples_split=45, min_samples_leaf=45, random_state=10),
            "RandomForestRegressor": RandomForestRegressor(
                n_jobs=-1, n_estimators=70, min_samples_leaf=10, random_state=10),
            "GradientBoostingRegressor": GradientBoostingRegressor(
                n_estimators=70, max_depth=5),
        }

        self.ups_dict = {}
        self.num_comments_dict = {}

        self.start()

    def disable_event(self):
        """Swallow window-close requests while training is running."""
        pass

    def edit_textbox(self, text, line, type='wait'):
        """Write a status entry into the progress textbox.

        Args:
            text: Status message to show.
            line: 1-based textbox line the entry belongs to.
            type: ``'wait'`` inserts a pending entry ("🕐 text..."); any
                other value replaces the entry on ``line`` with a finished
                one ("✅ text"). Pass the same ``line`` for both calls.
        """
        emoji = '🕐' if type == 'wait' else '✅'
        start_index = f'{line}.0'
        end_index = f'{line + 1}.0'

        # The textbox is read-only for the user, so unlock it while writing.
        self.updates.configure(state='normal')
        if type == 'wait':
            self.updates.insert(start_index, emoji + ' ' + text + '...' + '\n\n')
        else:
            self.updates.delete(start_index, end_index)
            self.updates.insert(start_index, emoji + ' ' + text + '\n\n')
        self.updates.configure(state='disabled')

        self.updates.see(start_index)
        # Redraw now; training runs synchronously in this same thread.
        self.update()

    def start(self):
        """Vectorise the text, split train/test sets and train all models."""
        # NOTE(review): the vectorizer is fitted on all posts before the
        # split, and only the TF-IDF matrix is used as model input; the
        # one-hot encoded columns are not part of X.
        self.tfidf = TfidfVectorizer()
        self.X = self.tfidf.fit_transform(self.posts['text'])

        self.edit_textbox('Preparing Data (Upvotes)', 1, 'wait')
        self.ups_df = self.posts.drop(['num_comments'], axis=1)
        (self.X_train_ups, self.X_test_ups,
         self.y_train_ups, self.y_test_ups) = train_test_split(
            self.X, self.ups_df['ups'], test_size=0.2, random_state=10)
        self.edit_textbox('Preparing Data (Upvotes)', 1, 'done')

        # Pending and finished entries must use the same line. The pending
        # entry was previously inserted at line 3 but completed at line 2,
        # which left a stale "🕐 ..." entry in the textbox.
        self.edit_textbox('Preparing Data (Number of Comments)', 2, 'wait')
        self.num_comments_df = self.posts.drop(['ups'], axis=1)
        (self.X_train_num_comments, self.X_test_num_comments,
         self.y_train_num_comments, self.y_test_num_comments) = train_test_split(
            self.X, self.num_comments_df['num_comments'], test_size=0.2, random_state=10)
        self.edit_textbox('Preparing Data (Number of Comments)', 2, 'done')

        self.train_models()

    def _ensure_model_dir(self):
        """Create ``model_dir`` (and any parents) when it does not exist."""
        os.makedirs(self.model_dir, exist_ok=True)

    def save_model(self, model, model_name):
        """Pickle ``model`` to ``<model_dir>/<model_name>.pkl``.

        Args:
            model: The fitted estimator to persist.
            model_name: File name without the ``.pkl`` extension.
        """
        self._ensure_model_dir()
        with open(os.path.join(self.model_dir, model_name + '.pkl'), 'wb') as f:
            pickle.dump(model, f)

    def _fit_and_evaluate(self, model, file_name, X_train, y_train, X_test, y_test):
        """Fit ``model``, save it as ``file_name`` and return its test metrics.

        Values are converted to built-in floats and lists so the result can
        always be serialised with ``json``.
        """
        model.fit(X_train, y_train)
        self.save_model(model, file_name)
        predicted = model.predict(X_test)
        mse = mean_squared_error(y_test, predicted)
        return {
            'mse': float(mse),
            'mae': float(mean_absolute_error(y_test, predicted)),
            'r2': float(r2_score(y_test, predicted)),
            'rmse': float(np.sqrt(mse)),
            'pred': np.asarray(predicted).tolist(),
            'actual': np.asarray(y_test).tolist(),
        }

    def train_models(self):
        """Train every estimator for both targets and persist all artefacts.

        Each estimator is fitted for upvotes, saved, then refitted for the
        comment count and saved again. Metrics are collected in ``ups_dict``
        and ``num_comments_dict`` and written out as JSON together with the
        fitted TF-IDF vectorizer. Closing the window is enabled at the end.
        """
        # (status label, file suffix, metrics store, train/test data)
        targets = (
            ('Upvotes', '_ups', self.ups_dict,
             self.X_train_ups, self.y_train_ups,
             self.X_test_ups, self.y_test_ups),
            ('Number of Comments', '_num_comments', self.num_comments_dict,
             self.X_train_num_comments, self.y_train_num_comments,
             self.X_test_num_comments, self.y_test_num_comments),
        )

        # Lines 1 and 2 already hold the data-preparation entries.
        line_count = 3
        for model_name, model in self.model_hashmap.items():
            for label, suffix, metrics, X_train, y_train, X_test, y_test in targets:
                status = 'Training {} for {}'.format(model_name, label)
                self.edit_textbox(status, line_count, 'wait')
                metrics[model_name] = self._fit_and_evaluate(
                    model, model_name + suffix, X_train, y_train, X_test, y_test)
                self.edit_textbox(status, line_count, 'done')
                line_count += 1

        # Do not rely on save_model having created the directory already.
        self._ensure_model_dir()

        with open(os.path.join(self.model_dir, 'vectorizer.pkl'), 'wb') as f:
            pickle.dump(self.tfidf, f)

        with open(os.path.join(self.model_dir, 'ups_metrics.json'), 'w', encoding='utf-8') as f:
            json.dump(self.ups_dict, f)
        with open(os.path.join(self.model_dir, 'num_comments_metrics.json'), 'w', encoding='utf-8') as f:
            json.dump(self.num_comments_dict, f)

        self.edit_textbox(
            'Training Complete. Models saved to models/ directory. You may now close this window.',
            line_count, 'done')

        # Training is finished, so the window may be closed again.
        self.protocol("WM_DELETE_WINDOW", self.enable_close)

    def enable_close(self):
        """Close the window."""
        self.destroy()