aboutsummaryrefslogtreecommitdiff
path: root/__dbscripts__
diff options
context:
space:
mode:
authorPriyansh <[email protected]>2021-12-09 03:01:49 -0500
committerPriyansh <[email protected]>2021-12-09 03:01:49 -0500
commit3b48c04470e2c368ba18716bb789b40942d6989f (patch)
tree1cb97d5271f2233fbf9d0ef864b91fc8a61efb17 /__dbscripts__
parent22e1d761027501bfa59b92776cf1c13eef3a0004 (diff)
downloadtemp_pred_arima-3b48c04470e2c368ba18716bb789b40942d6989f.tar.xz
temp_pred_arima-3b48c04470e2c368ba18716bb789b40942d6989f.zip
changed project structure
Diffstat (limited to '__dbscripts__')
-rw-r--r--__dbscripts__/database_creation_script.py107
-rw-r--r--__dbscripts__/dbcreation.ipynb460
2 files changed, 567 insertions, 0 deletions
diff --git a/__dbscripts__/database_creation_script.py b/__dbscripts__/database_creation_script.py
new file mode 100644
index 0000000..c0e9252
--- /dev/null
+++ b/__dbscripts__/database_creation_script.py
@@ -0,0 +1,107 @@
+# Running this script automatically creates the database and normalizes the data.
+# This is same as running the 'dbcreation.ipynb' notebook step by step.
+# However their are some visualizations in the notebook but this script does not.
+# This script is intentended to automate the whole process in a single script.
+# This should run faster than the notebook.
+
+import pandas as pd
+import sqlite3
+from sqlite3 import Error
+import os
+
+def create_connection(db_file, delete_db=False):
+
+ if delete_db and os.path.exists(db_file):
+ os.remove(db_file)
+
+ conn = None
+ try:
+ conn = sqlite3.connect(db_file)
+ conn.execute("PRAGMA foreign_keys = 1")
+ except Error as e:
+ print(e)
+
+ return conn
+
+def create_table(conn, create_table_sql):
+ try:
+ c = conn.cursor()
+ c.execute(create_table_sql)
+ except Error as e:
+ print(e)
+
+def execute_sql_statement(sql_statement, conn):
+ cur = conn.cursor()
+ cur.execute(sql_statement)
+
+ rows = cur.fetchall()
+
+ return rows
+
+df=pd.read_csv("..GlobalLandTemperaturesByCity.csv")
+data=df[df.dt>"1949-12-31"]
+data_for_loc_table=data[['City','Latitude','Longitude']]
+data_for_loc_table=data_for_loc_table.to_records(index=False)
+result_data_for_loc_table=list(data_for_loc_table)
+records=data.to_records(index=False)
+result=list(records)
+req_resut=[]
+for i in result:
+ req_resut.append((i[4],))
+req_resut=list((set(req_resut)))
+conn = create_connection('database.db')
+cur=conn.cursor()
+execute_sql_statement("DROP TABLE IF EXISTS City_Table",conn)
+execute_sql_statement("DROP TABLE IF EXISTS Country_Table",conn)
+create_table(conn,'''CREATE TABLE Country_Table (PK INTEGER PRIMARY KEY, Country_Name VARCHAR)''')
+cur.executemany("INSERT INTO Country_Table ('Country_Name') Values (?)",req_resut)
+abc=execute_sql_statement("Select * from Country_Table", conn)
+abc1={}
+for i in abc:
+ abc1[i[1]]=i[0]
+req_result_2=[]
+for i in result:
+ req_result_2.append((i[3],abc1[i[4]]))
+create_table(conn,'''CREATE TABLE City_Table (City_Id INTEGER PRIMARY KEY, City_Name VARCHAR , Country_ID INTEGER,
+FOREIGN KEY(Country_ID) REFERENCES Country_Table(PK))''')
+city=[]
+for i in result:
+ city.append((i[3],abc1[i[4]]))
+city_country_unique=list((set(city)))
+cur.executemany("INSERT INTO City_Table ('City_Name','Country_ID') Values (?,?)",city_country_unique)
+city_index=execute_sql_statement("Select * from City_Table", conn)
+city_indexes={}
+for i in abc:
+ abc1[i[1]]=i[0]
+city_table_values=execute_sql_statement("select * from City_Table ",conn)
+city_ids={}
+for i in city_table_values:
+ city_ids[i[1]]=i[0]
+create_table(conn,'''CREATE TABLE Temperature (PK INTEGER PRIMARY KEY , Date VARCHAR, Avg_Temperature VARCHAR, City_ID INTEGER,
+FOREIGN KEY(City_ID) REFERENCES City_Table(City_Id))''')
+temp_table_values=[]
+j=1
+for i in result:
+ j=j+1
+ temp_table_values.append((i[0],i[1],city_ids[i[3]]))
+cur.executemany("INSERT INTO Temperature ('Date','Avg_Temperature','City_ID') Values (?,?,?)",temp_table_values)
+create_table(conn,'''CREATE TABLE Loc_Table (ID INTEGER PRIMARY KEY , City_ID INTEGER, LAT FLOAT,LONG FLOAT,
+FOREIGN KEY(City_ID) REFERENCES City_Table(City_ID))''')
+data_to_check_loc=result_data_for_loc_table
+data_to_input_loc_table=[]
+j=0
+for i in data_to_check_loc:
+ t=[0,0,0]
+ if i[1][-1]=='N':
+ t[1]=(float(i[1][:-1]))
+ if i[1][-1]=='S':
+ t[1]=float(i[1][:-1])*(-1)
+ if i[2][-1]=='E':
+ t[2]=float(i[2][:-1])
+ if i[2][-1]=='W':
+ t[2]=float(i[2][:-1])*(-1)
+ t[0]=i[0]
+ tuple1=(city_ids[t[0]],t[1],t[2])
+ data_to_input_loc_table.append(tuple1)
+cur.executemany("INSERT INTO Loc_Table ('City_ID','LAT','LONG') Values (?,?,?)",data_to_input_loc_table)
+conn.commit()
diff --git a/__dbscripts__/dbcreation.ipynb b/__dbscripts__/dbcreation.ipynb
new file mode 100644
index 0000000..d4f34d1
--- /dev/null
+++ b/__dbscripts__/dbcreation.ipynb
@@ -0,0 +1,460 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "660de5e8",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Imports\n",
+ "\n",
+ "from IPython.display import display, HTML\n",
+ "import pandas as pd\n",
+ "import sqlite3\n",
+ "from sqlite3 import Error\n",
+ "import os"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1dc168cb",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Function to create a connection to the database\n",
+ "\n",
+ "def create_connection(db_file, delete_db=False):\n",
+ " \n",
+ " if delete_db and os.path.exists(db_file):\n",
+ " os.remove(db_file)\n",
+ "\n",
+ " conn = None\n",
+ " try:\n",
+ " conn = sqlite3.connect(db_file)\n",
+ " conn.execute(\"PRAGMA foreign_keys = 1\")\n",
+ " except Error as e:\n",
+ " print(e)\n",
+ "\n",
+ " return conn"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "910875f4",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Function to create a database\n",
+ "\n",
+ "def create_table(conn, create_table_sql):\n",
+ " try:\n",
+ " c = conn.cursor()\n",
+ " c.execute(create_table_sql)\n",
+ " except Error as e:\n",
+ " print(e)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "36907a1c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Execute SQL statement\n",
+ "\n",
+ "def execute_sql_statement(sql_statement, conn):\n",
+ " cur = conn.cursor()\n",
+ " cur.execute(sql_statement)\n",
+ "\n",
+ " rows = cur.fetchall()\n",
+ "\n",
+ " return rows"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "a0dfa189",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Read the Non-Normalized Data\n",
+ "\n",
+ "df=pd.read_csv(\"..GlobalLandTemperaturesByCity.csv\")\n",
+ "data=df[df.dt>\"1949-12-31\"]\n",
+ " "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "77341f9c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Extract cities and their latitude and longitude\n",
+ "\n",
+ "data_for_loc_table=data[['City','Latitude','Longitude']]\n",
+ "data_for_loc_table=data_for_loc_table.to_records(index=False)\n",
+ "result_data_for_loc_table=list(data_for_loc_table)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e635279c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Extract Countries\n",
+ "\n",
+ "records=data.to_records(index=False)\n",
+ "result=list(records)\n",
+ "req_resut=[]\n",
+ "for i in result:\n",
+ " req_resut.append((i[4],))\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "74c13a5f",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Convert the list of cities into a list with unique names\n",
+ "\n",
+ "req_resut=list((set(req_resut)))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "21eca337",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Create a connection to the database\n",
+ "conn = create_connection('database.db')\n",
+ "cur=conn.cursor()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "af74cdef",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Drop City Table and the Country Table if they exist and create a new Country Table and Insert the data\n",
+ "\n",
+ "execute_sql_statement(\"DROP TABLE IF EXISTS City_Table\",conn)\n",
+ "execute_sql_statement(\"DROP TABLE IF EXISTS Country_Table\",conn)\n",
+ "create_table(conn,'''CREATE TABLE Country_Table (PK INTEGER PRIMARY KEY, Country_Name VARCHAR)''')\n",
+ "cur.executemany(\"INSERT INTO Country_Table ('Country_Name') Values (?)\",req_resut)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "59930452",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Make a dictionary of the country data and their IDs\n",
+ "abc=execute_sql_statement(\"Select * from Country_Table\", conn)\n",
+ "abc1={}\n",
+ "for i in abc:\n",
+ " abc1[i[1]]=i[0]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "52c77691",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "req_result_2=[]\n",
+ "for i in result:\n",
+ " req_result_2.append((i[3],abc1[i[4]]))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f5b64360",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Create the City Table\n",
+ "create_table(conn,'''CREATE TABLE City_Table (City_Id INTEGER PRIMARY KEY, City_Name VARCHAR , Country_ID INTEGER,\n",
+ "FOREIGN KEY(Country_ID) REFERENCES Country_Table(PK))''')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1206d273",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Extract the data for City Table\n",
+ "city=[]\n",
+ "for i in result:\n",
+ " # j=j+1\n",
+ " city.append((i[3],abc1[i[4]]))\n",
+ "city_country_unique=list((set(city)))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "cfca5230",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Insert the data for City Table\n",
+ "cur.executemany(\"INSERT INTO City_Table ('City_Name','Country_ID') Values (?,?)\",city_country_unique)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d9f71f49",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#df=execute_sql_statement(\"Select * from City_Table\", conn)\n",
+ "df=pd.read_sql_query(\"Select * from City_Table\", conn)\n",
+ "display(df)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "3692d2ea",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Build a dictionary of the city data and their IDs\n",
+ "city_index=execute_sql_statement(\"Select * from City_Table\", conn)\n",
+ "city_indexes={}\n",
+ "for i in abc:\n",
+ " abc1[i[1]]=i[0]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "9dd4ef62",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Displaying the country data\n",
+ "df1=pd.read_sql_query(\"Select * from Country_Table\", conn)\n",
+ "display(df1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4c2a9868",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Join the data from the City Table and the Country Table and display the data\n",
+ "\n",
+ "execute_sql_statement(\"select * from City_Table join Country_Table on Country_Table.PK = City_Table.Country_ID AND Country_Table.Country_Name='India' LIMIT 10\",conn)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "3932f5a5",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# City IDs Dictionary\n",
+ "\n",
+ "city_table_values=execute_sql_statement(\"select * from City_Table \",conn)\n",
+ "city_ids={}\n",
+ "for i in city_table_values:\n",
+ " city_ids[i[1]]=i[0]\n",
+ "city_ids\n",
+ " "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "fcd18ad9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Create the Temperature Table\n",
+ "\n",
+ "create_table(conn,'''CREATE TABLE Temperature (PK INTEGER PRIMARY KEY , Date VARCHAR, Avg_Temperature VARCHAR, City_ID INTEGER,\n",
+ "FOREIGN KEY(City_ID) REFERENCES City_Table(City_Id))''')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e1a0d17e",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Allocate the data to the Temperature Table and their corresponding City IDs\n",
+ "\n",
+ "temp_table_values=[]\n",
+ "j=1\n",
+ "for i in result:\n",
+ " j=j+1\n",
+ " temp_table_values.append((i[0],i[1],city_ids[i[3]]))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "fb218911",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Insert the data into the Temperature Table\n",
+ "\n",
+ "cur.executemany(\"INSERT INTO Temperature ('Date','Avg_Temperature','City_ID') Values (?,?,?)\",temp_table_values)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8e34f625",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Displaying the Temperature Table\n",
+ "\n",
+ "df1=pd.read_sql_query(\"Select * from Temperature\", conn)\n",
+ "display(df1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f08f8148",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Create the Location Table\n",
+ "\n",
+ "create_table(conn,'''CREATE TABLE Loc_Table (ID INTEGER PRIMARY KEY , City_ID INTEGER, LAT FLOAT,LONG FLOAT,\n",
+ "FOREIGN KEY(City_ID) REFERENCES City_Table(City_ID))''')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ee98ee19",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Extract the data for the Location Table, Database consists of N,W,E,S notation, we are going to convert them to the decimal notation\n",
+ "\n",
+ "#Lat= North is positive, South is negative\n",
+ "#Long= West is negative, East is positive\n",
+ "data_to_check_loc=result_data_for_loc_table\n",
+ "data_to_input_loc_table=[]\n",
+ "j=0\n",
+ "for i in data_to_check_loc:\n",
+ " t=[0,0,0]\n",
+ " if i[1][-1]=='N':\n",
+ " t[1]=(float(i[1][:-1]))\n",
+ " if i[1][-1]=='S':\n",
+ " t[1]=float(i[1][:-1])*(-1)\n",
+ " if i[2][-1]=='E':\n",
+ " t[2]=float(i[2][:-1]) \n",
+ " if i[2][-1]=='W':\n",
+ " t[2]=float(i[2][:-1])*(-1)\n",
+ " t[0]=i[0]\n",
+ " tuple1=(city_ids[t[0]],t[1],t[2])\n",
+ " data_to_input_loc_table.append(tuple1)\n",
+ "data_to_input_loc_table"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "cc0fba0c",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Insert the data into the Location Table\n",
+ "\n",
+ "cur.executemany(\"INSERT INTO Loc_Table ('City_ID','LAT','LONG') Values (?,?,?)\",data_to_input_loc_table)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "aead2755",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Displaying the Location Table\n",
+ "\n",
+ "df1=pd.read_sql_query(\"Select * from Loc_Table \", conn)\n",
+ "display(df1)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "f67e8d5b",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# df1=pd.read_sql_query(\"select * from Loc_Table join Temperature on Loc_Table.City_ID=Temperature.City_ID join city_table on city_table.city_id=Temperature.City_ID join country_table on country_table.pk=city_table.country_id group by Date LIMIT 10\", conn)\n",
+ "# display(df1)\n",
+ "#(\"select * from Loc_Table join Temperature on Loc_Table.City_ID=Temperature.City_ID join city_table on city_table.city_id=Temperature.City_ID join country_table on country_table.pk=city_table.country_id LIMIT 10\",conn)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "cbc5ad63",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Commit the changes\n",
+ "\n",
+ "conn.commit()"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}