diff --git a/SQL_python_connection.ipynb b/SQL_python_connection.ipynb new file mode 100644 index 0000000..745a7cb --- /dev/null +++ b/SQL_python_connection.ipynb @@ -0,0 +1,1033 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "59ab424e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: sqlalchemy in /opt/anaconda3/lib/python3.13/site-packages (2.0.39)\n", + "Requirement already satisfied: pandas in /opt/anaconda3/lib/python3.13/site-packages (2.2.3)\n", + "Requirement already satisfied: mysql-connector-python in /opt/anaconda3/lib/python3.13/site-packages (9.5.0)\n", + "Requirement already satisfied: typing-extensions>=4.6.0 in /opt/anaconda3/lib/python3.13/site-packages (from sqlalchemy) (4.15.0)\n", + "Requirement already satisfied: numpy>=1.26.0 in /opt/anaconda3/lib/python3.13/site-packages (from pandas) (2.1.3)\n", + "Requirement already satisfied: python-dateutil>=2.8.2 in /opt/anaconda3/lib/python3.13/site-packages (from pandas) (2.9.0.post0)\n", + "Requirement already satisfied: pytz>=2020.1 in /opt/anaconda3/lib/python3.13/site-packages (from pandas) (2024.1)\n", + "Requirement already satisfied: tzdata>=2022.7 in /opt/anaconda3/lib/python3.13/site-packages (from pandas) (2025.2)\n", + "Requirement already satisfied: six>=1.5 in /opt/anaconda3/lib/python3.13/site-packages (from python-dateutil>=2.8.2->pandas) (1.17.0)\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "pip install sqlalchemy pandas mysql-connector-python" + ] + }, + { + "cell_type": "markdown", + "id": "05ae9bda", + "metadata": {}, + "source": [ + "Importing the SQLAlchemy library for MySQL" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "a98c9515", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from sqlalchemy import create_engine" + ] + }, + { + "cell_type": "markdown", +
"id": "86f9c6fc", + "metadata": {}, + "source": [ + "Challenge - 1" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "e123eb11", + "metadata": {}, + "outputs": [], + "source": [ + "engine = create_engine(\n", + " \"mysql+mysqlconnector://root:divya2025@localhost:3306/sakila\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "476cb1ed", + "metadata": {}, + "source": [ + "Testing the connection" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "fa01a84b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
rental_idrental_dateinventory_idcustomer_idreturn_datestaff_idlast_update
012005-05-24 22:53:303671302005-05-26 22:04:3012006-02-15 21:30:53
122005-05-24 22:54:3315254592005-05-28 19:40:3312006-02-15 21:30:53
232005-05-24 23:03:3917114082005-06-01 22:12:3912006-02-15 21:30:53
342005-05-24 23:04:4124523332005-06-03 01:43:4122006-02-15 21:30:53
452005-05-24 23:05:2120792222005-06-02 04:33:2112006-02-15 21:30:53
\n", + "
" + ], + "text/plain": [ + " rental_id rental_date inventory_id customer_id \\\n", + "0 1 2005-05-24 22:53:30 367 130 \n", + "1 2 2005-05-24 22:54:33 1525 459 \n", + "2 3 2005-05-24 23:03:39 1711 408 \n", + "3 4 2005-05-24 23:04:41 2452 333 \n", + "4 5 2005-05-24 23:05:21 2079 222 \n", + "\n", + " return_date staff_id last_update \n", + "0 2005-05-26 22:04:30 1 2006-02-15 21:30:53 \n", + "1 2005-05-28 19:40:33 1 2006-02-15 21:30:53 \n", + "2 2005-06-01 22:12:39 1 2006-02-15 21:30:53 \n", + "3 2005-06-03 01:43:41 2 2006-02-15 21:30:53 \n", + "4 2005-06-02 04:33:21 1 2006-02-15 21:30:53 " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "query = \"SELECT * FROM rental LIMIT 5;\"\n", + "df_test = pd.read_sql(query, engine)\n", + "df_test" + ] + }, + { + "cell_type": "markdown", + "id": "36d092b2", + "metadata": {}, + "source": [ + "Loading the dataframe for functions" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "ef5b5b10", + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_sql(query, engine)" + ] + }, + { + "cell_type": "markdown", + "id": "3d7f28e0", + "metadata": {}, + "source": [ + "Challenge - 2" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "d4164e13", + "metadata": {}, + "outputs": [], + "source": [ + "def rentals_month(engine, month, year):\n", + " query = f\"\"\"\n", + " SELECT * FROM rental\n", + " WHERE MONTH(rental_date) = {month}\n", + " AND YEAR(rental_date) = {year};\n", + " \"\"\"\n", + "\n", + " df = pd.read_sql(query, engine) #df is loaded inside the function to avoid a global variable\n", + " return df " + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "d7356a96", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
rental_idrental_dateinventory_idcustomer_idreturn_datestaff_idlast_update
012005-05-24 22:53:303671302005-05-26 22:04:3012006-02-15 21:30:53
122005-05-24 22:54:3315254592005-05-28 19:40:3312006-02-15 21:30:53
232005-05-24 23:03:3917114082005-06-01 22:12:3912006-02-15 21:30:53
342005-05-24 23:04:4124523332005-06-03 01:43:4122006-02-15 21:30:53
452005-05-24 23:05:2120792222005-06-02 04:33:2112006-02-15 21:30:53
........................
115111532005-05-31 21:36:4427255062005-06-10 01:26:4422006-02-15 21:30:53
115211542005-05-31 21:42:092732592005-06-08 16:40:0912006-02-15 21:30:53
115311552005-05-31 22:17:1120482512005-06-04 20:27:1122006-02-15 21:30:53
115411562005-05-31 22:37:344601062005-06-01 23:02:3422006-02-15 21:30:53
115511572005-05-31 22:47:451449612005-06-02 18:01:4512006-02-15 21:30:53
\n", + "

1156 rows × 7 columns

\n", + "
" + ], + "text/plain": [ + " rental_id rental_date inventory_id customer_id \\\n", + "0 1 2005-05-24 22:53:30 367 130 \n", + "1 2 2005-05-24 22:54:33 1525 459 \n", + "2 3 2005-05-24 23:03:39 1711 408 \n", + "3 4 2005-05-24 23:04:41 2452 333 \n", + "4 5 2005-05-24 23:05:21 2079 222 \n", + "... ... ... ... ... \n", + "1151 1153 2005-05-31 21:36:44 2725 506 \n", + "1152 1154 2005-05-31 21:42:09 2732 59 \n", + "1153 1155 2005-05-31 22:17:11 2048 251 \n", + "1154 1156 2005-05-31 22:37:34 460 106 \n", + "1155 1157 2005-05-31 22:47:45 1449 61 \n", + "\n", + " return_date staff_id last_update \n", + "0 2005-05-26 22:04:30 1 2006-02-15 21:30:53 \n", + "1 2005-05-28 19:40:33 1 2006-02-15 21:30:53 \n", + "2 2005-06-01 22:12:39 1 2006-02-15 21:30:53 \n", + "3 2005-06-03 01:43:41 2 2006-02-15 21:30:53 \n", + "4 2005-06-02 04:33:21 1 2006-02-15 21:30:53 \n", + "... ... ... ... \n", + "1151 2005-06-10 01:26:44 2 2006-02-15 21:30:53 \n", + "1152 2005-06-08 16:40:09 1 2006-02-15 21:30:53 \n", + "1153 2005-06-04 20:27:11 2 2006-02-15 21:30:53 \n", + "1154 2005-06-01 23:02:34 2 2006-02-15 21:30:53 \n", + "1155 2005-06-02 18:01:45 1 2006-02-15 21:30:53 \n", + "\n", + "[1156 rows x 7 columns]" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "may_df = rentals_month(engine, 5, 2005)\n", + "may_df" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "65958955", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
rental_idrental_dateinventory_idcustomer_idreturn_datestaff_idlast_update
011582005-06-14 22:53:3316324162005-06-18 21:37:3322006-02-15 21:30:53
111592005-06-14 22:55:1343955162005-06-17 02:11:1312006-02-15 21:30:53
211602005-06-14 23:00:3427952392005-06-18 01:58:3422006-02-15 21:30:53
311612005-06-14 23:07:0816902852005-06-21 17:12:0812006-02-15 21:30:53
411622005-06-14 23:09:389873102005-06-23 22:00:3812006-02-15 21:30:53
........................
230634652005-06-21 22:10:0114885102005-06-30 21:35:0112006-02-15 21:30:53
230734662005-06-21 22:13:333712262005-06-25 21:01:3322006-02-15 21:30:53
230834672005-06-21 22:19:257295432005-06-27 00:03:2522006-02-15 21:30:53
230934682005-06-21 22:43:4528991002005-06-30 01:49:4512006-02-15 21:30:53
231034692005-06-21 22:48:5940871812005-06-28 19:32:5912006-02-15 21:30:53
\n", + "

2311 rows × 7 columns

\n", + "
" + ], + "text/plain": [ + " rental_id rental_date inventory_id customer_id \\\n", + "0 1158 2005-06-14 22:53:33 1632 416 \n", + "1 1159 2005-06-14 22:55:13 4395 516 \n", + "2 1160 2005-06-14 23:00:34 2795 239 \n", + "3 1161 2005-06-14 23:07:08 1690 285 \n", + "4 1162 2005-06-14 23:09:38 987 310 \n", + "... ... ... ... ... \n", + "2306 3465 2005-06-21 22:10:01 1488 510 \n", + "2307 3466 2005-06-21 22:13:33 371 226 \n", + "2308 3467 2005-06-21 22:19:25 729 543 \n", + "2309 3468 2005-06-21 22:43:45 2899 100 \n", + "2310 3469 2005-06-21 22:48:59 4087 181 \n", + "\n", + " return_date staff_id last_update \n", + "0 2005-06-18 21:37:33 2 2006-02-15 21:30:53 \n", + "1 2005-06-17 02:11:13 1 2006-02-15 21:30:53 \n", + "2 2005-06-18 01:58:34 2 2006-02-15 21:30:53 \n", + "3 2005-06-21 17:12:08 1 2006-02-15 21:30:53 \n", + "4 2005-06-23 22:00:38 1 2006-02-15 21:30:53 \n", + "... ... ... ... \n", + "2306 2005-06-30 21:35:01 1 2006-02-15 21:30:53 \n", + "2307 2005-06-25 21:01:33 2 2006-02-15 21:30:53 \n", + "2308 2005-06-27 00:03:25 2 2006-02-15 21:30:53 \n", + "2309 2005-06-30 01:49:45 1 2006-02-15 21:30:53 \n", + "2310 2005-06-28 19:32:59 1 2006-02-15 21:30:53 \n", + "\n", + "[2311 rows x 7 columns]" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "june_df = rentals_month(engine, 6, 2005)\n", + "june_df" + ] + }, + { + "cell_type": "markdown", + "id": "5021ad98", + "metadata": {}, + "source": [ + "Challenge - 3" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "471c0cea", + "metadata": {}, + "outputs": [], + "source": [ + "def rental_count_month (df, month, year):\n", + " column_name = f\"rentals_{month:02d}_{year}\" # since column name as per instructions should be rentals_month number_year\n", + "\n", + " result = (df.groupby(\"customer_id\")[\"rental_id\"].count().reset_index(name=column_name))\n", + " return result" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": 
"1b80daf3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customer_idrentals_05_2005
012
121
232
353
463
.........
5155944
5165951
5175966
5185972
5195991
\n", + "

520 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " customer_id rentals_05_2005\n", + "0 1 2\n", + "1 2 1\n", + "2 3 2\n", + "3 5 3\n", + "4 6 3\n", + ".. ... ...\n", + "515 594 4\n", + "516 595 1\n", + "517 596 6\n", + "518 597 2\n", + "519 599 1\n", + "\n", + "[520 rows x 2 columns]" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "may_count = rental_count_month(may_df, 5, 2005)\n", + "may_count" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "fc45ae1f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customer_idrentals_06_2005
017
121
234
346
455
.........
5855952
5865962
5875973
5885981
5895994
\n", + "

590 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " customer_id rentals_06_2005\n", + "0 1 7\n", + "1 2 1\n", + "2 3 4\n", + "3 4 6\n", + "4 5 5\n", + ".. ... ...\n", + "585 595 2\n", + "586 596 2\n", + "587 597 3\n", + "588 598 1\n", + "589 599 4\n", + "\n", + "[590 rows x 2 columns]" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "june_count = rental_count_month(june_df, 6, 2005)\n", + "june_count" + ] + }, + { + "cell_type": "markdown", + "id": "1275699b", + "metadata": {}, + "source": [ + "Create a Python function called compare_rentals that takes two DataFrames as input containing the number of rentals made by each customer in different months and years. The function should return a combined DataFrame with a new 'difference' column, which is the difference between the number of rentals in the two months." + ] + }, + { + "cell_type": "markdown", + "id": "769fc709", + "metadata": {}, + "source": [ + "Challenge - 4" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "cb19f7a7", + "metadata": {}, + "outputs": [], + "source": [ + "def compare_rentals(df1, df2):\n", + " merged = df1.merge(df2, on=\"customer_id\", how =\"inner\") #merging both dfs to find out the difference, only for customers who are present in both months (inner join)\n", + " rental_columns = merged.columns.drop(\"customer_id\")\n", + " merged[\"difference\"] = merged[rental_columns[1]] - merged[rental_columns[0]]\n", + " return merged\n" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "id": "9c765aad", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customer_idrentals_05_2005rentals_06_2005difference
01275
12110
23242
35352
46341
\n", + "
" + ], + "text/plain": [ + " customer_id rentals_05_2005 rentals_06_2005 difference\n", + "0 1 2 7 5\n", + "1 2 1 1 0\n", + "2 3 2 4 2\n", + "3 5 3 5 2\n", + "4 6 3 4 1" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "may_df = rentals_month(engine, 5, 2005)\n", + "june_df = rentals_month(engine, 6, 2005)\n", + "\n", + "may_counts = rental_count_month(may_df, 5, 2005)\n", + "june_counts = rental_count_month(june_df, 6, 2005)\n", + "\n", + "comparison = compare_rentals(may_counts, june_counts)\n", + "comparison.head()\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}