aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--project.ipynb31
1 files changed, 28 insertions, 3 deletions
diff --git a/project.ipynb b/project.ipynb
index 17c503f..23b2c35 100644
--- a/project.ipynb
+++ b/project.ipynb
@@ -1588,6 +1588,21 @@
]
},
{
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Machine Learning Models\n",
+ "\n",
+ "- DummyRegressor\n",
+ "- LinearRegression\n",
+ "- RidgeCV\n",
+ "- KNeighborsRegressor\n",
+ "- DecisionTreeRegressor\n",
+ "- RandomForestRegressor\n",
+ "- GradientBoostingRegressor\n"
+ ]
+ },
+ {
"cell_type": "code",
"execution_count": 1089,
"metadata": {},
@@ -1624,21 +1639,24 @@
"metadata": {},
"outputs": [],
"source": [
+ "# Import the necessary libraries\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.preprocessing import LabelBinarizer\n",
"from scipy.sparse import hstack\n",
"\n",
- "TfidfVectorizer = TfidfVectorizer()\n",
+ "\n",
+ "TfIdfVectorizer = TfidfVectorizer()\n",
"label_binarizer = LabelBinarizer()\n",
"\n",
+ "# Generate the tf-idf vectors for the text features and binarize the categorical features\n",
"num_comments_cat =[label_binarizer.fit_transform(df[col]) for col in categorical_features]\n",
- "num_comments_tfidf = TfidfVectorizer.fit_transform(df['text'])\n",
+ "num_comments_tfidf = TfIdfVectorizer.fit_transform(df['text'])\n",
"num_comments_X = hstack([num_comments_tfidf] + num_comments_cat).tocsr()\n",
"num_comments_y = df_num_comments['num_comments']\n",
"\n",
"ups_cat =[label_binarizer.fit_transform(df[col]) for col in categorical_features]\n",
- "ups_tfidf = TfidfVectorizer.fit_transform(df['text'])\n",
+ "ups_tfidf = TfIdfVectorizer.fit_transform(df['text'])\n",
"ups_X = hstack([ups_tfidf] + ups_cat).tocsr()\n",
"ups_y = df_ups['ups']\n",
"\n",
@@ -1664,6 +1682,7 @@
}
],
"source": [
+ "# Shape of the dataframes\n",
"X_train_ups.shape, X_test_ups.shape, y_train_ups.shape, y_test_ups.shape"
]
},
@@ -1684,6 +1703,7 @@
}
],
"source": [
+ "# Shape of the num_comments train/test splits\n",
"X_train_num_comments.shape, X_test_num_comments.shape, y_train_num_comments.shape, y_test_num_comments.shape"
]
},
@@ -1693,6 +1713,7 @@
"metadata": {},
"outputs": [],
"source": [
+ "# Import the necessary libraries\n",
"import numpy as np\n",
"from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error\n",
"from sklearn.dummy import DummyRegressor\n",
@@ -1703,6 +1724,7 @@
"from sklearn.ensemble import RandomForestRegressor\n",
"from sklearn.ensemble import GradientBoostingRegressor\n",
"\n",
+ "# Create a function to evaluate the models\n",
"def model_diagnostics(model, X_test, y_test, pr=True):\n",
" \"\"\"\n",
" Returns and prints the R-squared, RMSE and the MAE for a trained model\n",
@@ -1787,6 +1809,7 @@
"import os\n",
"import pickle\n",
"\n",
+ "# Create a function to save the models\n",
"def save_model(model, model_name):\n",
" \"\"\"\n",
" Saves the model to the models/ directory\n",
@@ -1797,6 +1820,7 @@
" pickle.dump(model, f)\n",
"\n",
"\n",
+ "# Create a hash table to store the model objects\n",
"model_hashmap = {\n",
" \"DummyRegressor\": DummyRegressor(),\n",
" \"LinearRegression\": LinearRegression(),\n",
@@ -1807,6 +1831,7 @@
" \"GradientBoostingRegressor\": GradientBoostingRegressor(n_estimators=70, max_depth=5)\n",
"}\n",
"\n",
+ "# Train the models\n",
"for model_name, model in model_hashmap.items():\n",
" print(\"Processing model: \", model_name)\n",
"\n",