1 files changed, 28 insertions, 3 deletions
diff --git a/project.ipynb b/project.ipynb
index 17c503f..23b2c35 100644
--- a/project.ipynb
+++ b/project.ipynb
@@ -1588,6 +1588,21 @@
    ]
   },
   {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Machine Learning Models\n",
+    "\n",
+    "- DummyRegressor\n",
+    "- LinearRegression\n",
+    "- RidgeCV\n",
+    "- KNeighborsRegressor\n",
+    "- DecisionTreeRegressor\n",
+    "- RandomForestRegressor\n",
+    "- GradientBoostingRegressor\n"
+   ]
+  },
+  {
    "cell_type": "code",
    "execution_count": 1089,
    "metadata": {},
@@ -1624,21 +1639,24 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "# Import the necessary libraries\n",
     "from sklearn.model_selection import train_test_split\n",
     "from sklearn.feature_extraction.text import TfidfVectorizer\n",
     "from sklearn.preprocessing import LabelBinarizer\n",
     "from scipy.sparse import hstack\n",
     "\n",
-    "TfidfVectorizer = TfidfVectorizer()\n",
+    "\n",
+    "TfIdfVectorizer = TfidfVectorizer()\n",
     "label_binarizer = LabelBinarizer()\n",
     "\n",
+    "# Generate the tf-idf vectors for the text features and binarize the categorical features\n",
     "num_comments_cat =[label_binarizer.fit_transform(df[col]) for col in categorical_features]\n",
-    "num_comments_tfidf = TfidfVectorizer.fit_transform(df['text'])\n",
+    "num_comments_tfidf = TfIdfVectorizer.fit_transform(df['text'])\n",
     "num_comments_X = hstack([num_comments_tfidf] + num_comments_cat).tocsr()\n",
     "num_comments_y = df_num_comments['num_comments']\n",
     "\n",
     "ups_cat =[label_binarizer.fit_transform(df[col]) for col in categorical_features]\n",
-    "ups_tfidf = TfidfVectorizer.fit_transform(df['text'])\n",
+    "ups_tfidf = TfIdfVectorizer.fit_transform(df['text'])\n",
     "ups_X = hstack([ups_tfidf] + ups_cat).tocsr()\n",
     "ups_y = df_ups['ups']\n",
     "\n",
@@ -1664,6 +1682,7 @@
     }
    ],
    "source": [
+    "# Shapes of the train/test splits for the ups target\n",
     "X_train_ups.shape, X_test_ups.shape, y_train_ups.shape, y_test_ups.shape"
    ]
   },
@@ -1684,6 +1703,7 @@
     }
    ],
    "source": [
+    "# Shapes of the train/test splits for the num_comments target\n",
     "X_train_num_comments.shape, X_test_num_comments.shape, y_train_num_comments.shape, y_test_num_comments.shape"
    ]
   },
@@ -1693,6 +1713,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "# Import the necessary libraries\n",
     "import numpy as np\n",
     "from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error\n",
     "from sklearn.dummy import DummyRegressor\n",
@@ -1703,6 +1724,7 @@
     "from sklearn.ensemble import RandomForestRegressor\n",
     "from sklearn.ensemble import GradientBoostingRegressor\n",
     "\n",
+    "# Create a function to evaluate the models\n",
     "def model_diagnostics(model, X_test, y_test, pr=True):\n",
     "    \"\"\"\n",
     "    Returns and prints the R-squared, RMSE and the MAE for a trained model\n",
@@ -1787,6 +1809,7 @@
     "import os\n",
     "import pickle\n",
     "\n",
+    "# Create a function to save the models\n",
     "def save_model(model, model_name):\n",
     "    \"\"\"\n",
     "    Saves the model to the models/ directory\n",
@@ -1797,6 +1820,7 @@
     "        pickle.dump(model, f)\n",
     "\n",
     "\n",
+    "# Create a hash table to store the model objects\n",
     "model_hashmap = {\n",
     "    \"DummyRegressor\": DummyRegressor(),\n",
     "    \"LinearRegression\": LinearRegression(),\n",
@@ -1807,6 +1831,7 @@
     "    \"GradientBoostingRegressor\": GradientBoostingRegressor(n_estimators=70, max_depth=5)\n",
     "}\n",
     "\n",
+    "# Train the models\n",
     "for model_name, model in model_hashmap.items():\n",
     "    print(\"Processing model: \", model_name)\n",
     "\n",