diff options
| author | Bobby <[email protected]> | 2022-11-14 19:55:08 -0500 |
|---|---|---|
| committer | Bobby <[email protected]> | 2022-11-14 19:55:08 -0500 |
| commit | 627c95a202c3353b18b87c8b89e296c32ad30a05 (patch) | |
| tree | 15c008f77f3e06a9696bec9b32fd26467f23862a | |
| parent | 47239f3ac39118b9bd8e078cd5a034087f4de49e (diff) | |
| download | RedditEngagementPrediction-627c95a202c3353b18b87c8b89e296c32ad30a05.tar.xz RedditEngagementPrediction-627c95a202c3353b18b87c8b89e296c32ad30a05.zip | |
update
| -rw-r--r-- | project.ipynb | 31 |
1 files changed, 28 insertions, 3 deletions
diff --git a/project.ipynb b/project.ipynb index 17c503f..23b2c35 100644 --- a/project.ipynb +++ b/project.ipynb @@ -1588,6 +1588,21 @@ ] }, { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Machine Learning Models\n", + "\n", + "- DummyRegressor\n", + "- LinearRegression\n", + "- RidgeCV\n", + "- KNeighborsRegressor\n", + "- DecisionTreeRegressor\n", + "- RandomForestRegressor\n", + "- GradientBoostingRegressor\n" + ] + }, + { "cell_type": "code", "execution_count": 1089, "metadata": {}, @@ -1624,21 +1639,24 @@ "metadata": {}, "outputs": [], "source": [ + "# Import the necessary libraries\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.preprocessing import LabelBinarizer\n", "from scipy.sparse import hstack\n", "\n", - "TfidfVectorizer = TfidfVectorizer()\n", + "\n", + "TfIdfVectorizer = TfidfVectorizer()\n", "label_binarizer = LabelBinarizer()\n", "\n", + "# Generate the tf-idf vectors for the text features and binarize the categorical features\n", "num_comments_cat =[label_binarizer.fit_transform(df[col]) for col in categorical_features]\n", - "num_comments_tfidf = TfidfVectorizer.fit_transform(df['text'])\n", + "num_comments_tfidf = TfIdfVectorizer.fit_transform(df['text'])\n", "num_comments_X = hstack([num_comments_tfidf] + num_comments_cat).tocsr()\n", "num_comments_y = df_num_comments['num_comments']\n", "\n", "ups_cat =[label_binarizer.fit_transform(df[col]) for col in categorical_features]\n", - "ups_tfidf = TfidfVectorizer.fit_transform(df['text'])\n", + "ups_tfidf = TfIdfVectorizer.fit_transform(df['text'])\n", "ups_X = hstack([ups_tfidf] + ups_cat).tocsr()\n", "ups_y = df_ups['ups']\n", "\n", @@ -1664,6 +1682,7 @@ } ], "source": [ + "# Shape of the dataframes\n", "X_train_ups.shape, X_test_ups.shape, y_train_ups.shape, y_test_ups.shape" ] }, @@ -1684,6 +1703,7 @@ } ], "source": [ + "# Shapes of the num_comments train/test splits\n", "X_train_num_comments.shape, 
X_test_num_comments.shape, y_train_num_comments.shape, y_test_num_comments.shape" ] }, @@ -1693,6 +1713,7 @@ "metadata": {}, "outputs": [], "source": [ + "# Import the necessary libraries\n", "import numpy as np\n", "from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error\n", "from sklearn.dummy import DummyRegressor\n", @@ -1703,6 +1724,7 @@ "from sklearn.ensemble import RandomForestRegressor\n", "from sklearn.ensemble import GradientBoostingRegressor\n", "\n", + "# Create a function to evaluate the models\n", "def model_diagnostics(model, X_test, y_test, pr=True):\n", " \"\"\"\n", " Returns and prints the R-squared, RMSE and the MAE for a trained model\n", @@ -1787,6 +1809,7 @@ "import os\n", "import pickle\n", "\n", + "# Create a function to save the models\n", "def save_model(model, model_name):\n", " \"\"\"\n", " Saves the model to the models/ directory\n", @@ -1797,6 +1820,7 @@ " pickle.dump(model, f)\n", "\n", "\n", + "# Create a hash table to store the model objects\n", "model_hashmap = {\n", " \"DummyRegressor\": DummyRegressor(),\n", " \"LinearRegression\": LinearRegression(),\n", @@ -1807,6 +1831,7 @@ " \"GradientBoostingRegressor\": GradientBoostingRegressor(n_estimators=70, max_depth=5)\n", "}\n", "\n", + "# Train the models\n", "for model_name, model in model_hashmap.items():\n", " print(\"Processing model: \", model_name)\n", "\n", |
