# Notebook dependencies. The first cell installed them with the IPython shell
# escape below; it is kept as a comment because `!` is not Python syntax:
#   !pip install sqlalchemy pymysql pandas
import os

import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.engine import URL

# Credentials are read from the environment so they never have to be typed
# into (and committed with) this file. When the variables are unset they fall
# back to the same empty placeholders the notebook shipped with.
USER = os.environ.get("MYSQL_USER", "")
PASSWORD = os.environ.get("MYSQL_PASSWORD", "")
HOST = "127.0.0.1"
PORT = 3306
DB = "sakila"

# Build the URL from its components instead of an f-string: URL.create()
# escapes each part, so a user name or password containing characters such as
# "@", ":", "/" or "#" cannot corrupt the connection string.
engine = create_engine(
    URL.create(
        drivername="mysql+pymysql",
        username=USER,
        password=PASSWORD,
        host=HOST,
        port=PORT,
        database=DB,
    )
)
"id": "74c82b84", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
1
01
def rentals_month(engine, month: int, year: int) -> pd.DataFrame:
    """Return the rental rows of one calendar month as a DataFrame.

    Args:
        engine: Connection object handed to ``pd.read_sql``. The query uses
            ``%(name)s`` placeholders, as the original query did.
        month: Calendar month, 1-12.
        year: Four-digit calendar year.

    Returns:
        A DataFrame with the columns rental_id, rental_date, inventory_id,
        customer_id, staff_id and return_date for every row of ``rental``
        whose rental_date falls inside the requested month.

    Raises:
        ValueError: If ``month``/``year`` do not form a valid date. The
            previous MONTH()/YEAR() filter silently returned an empty frame
            for such input.
    """
    # Function-scope import so this block does not depend on the file header.
    from datetime import date

    # Constructing the dates validates month/year before the database is hit.
    period_start = date(year, month, 1)
    if month == 12:
        period_end = date(year + 1, 1, 1)
    else:
        period_end = date(year, month + 1, 1)

    # Half-open range [period_start, period_end) selects the same rows as
    # MONTH(rental_date) = m AND YEAR(rental_date) = y, but leaves the column
    # bare so the database can use an index on rental_date if one exists.
    query = """
        SELECT rental_id, rental_date, inventory_id, customer_id, staff_id, return_date
        FROM rental
        WHERE rental_date >= %(period_start)s
          AND rental_date < %(period_end)s;
    """
    return pd.read_sql(
        query,
        engine,
        params={"period_start": period_start, "period_end": period_end},
    )


def rental_count_month(rentals_df: pd.DataFrame, month: int, year: int) -> pd.DataFrame:
    """Count rentals per customer for one month.

    Args:
        rentals_df: Rental rows containing a ``customer_id`` column.
        month: Month used only to name the count column.
        year: Year used only to name the count column.

    Returns:
        A DataFrame with two columns: ``customer_id`` and
        ``rentals_MM_YYYY`` (for example ``rentals_05_2005``), holding the
        number of rows per customer.
    """
    col_name = f"rentals_{month:02d}_{year}"

    counts = (
        rentals_df.groupby("customer_id")
        .size()
        .reset_index(name=col_name)
    )
    return counts


def compare_rentals(
    counts_df_1: pd.DataFrame,
    counts_df_2: pd.DataFrame,
    how: str = "inner",
) -> pd.DataFrame:
    """Join two per-customer count frames and add ``difference``.

    ``difference`` is the second frame's count minus the first frame's count.

    Args:
        counts_df_1: Frame with ``customer_id`` plus a count column (the
            first non-``customer_id`` column is used).
        counts_df_2: Same layout, for the period to compare against.
        how: Join type forwarded to ``DataFrame.merge``. The default
            ``"inner"`` keeps the original behaviour: only customers present
            in both frames are returned. With any other join type, customers
            missing from one side get a count of 0 there, so that customers
            active in only one of the two periods are not dropped.

    Returns:
        The merged frame with an extra ``difference`` column. If both inputs
        use the same count-column name, the merged columns carry pandas'
        default ``_x`` / ``_y`` suffixes.
    """
    key = "customer_id"

    # Identify the count columns (the first column that is not the join key).
    month_col_1 = [c for c in counts_df_1.columns if c != key][0]
    month_col_2 = [c for c in counts_df_2.columns if c != key][0]

    merged = counts_df_1.merge(counts_df_2, on=key, how=how)

    # pandas renames non-key columns that exist on both sides with _x / _y.
    # Resolve the post-merge names so that comparing two frames whose count
    # columns share a name no longer fails with KeyError.
    out_col_1 = f"{month_col_1}_x" if month_col_1 in counts_df_2.columns else month_col_1
    out_col_2 = f"{month_col_2}_y" if month_col_2 in counts_df_1.columns else month_col_2

    if how != "inner":
        # Rows coming from only one side hold NaN on the other: treat that as
        # zero rentals and restore the input dtype that NaN upcast to float.
        sources = (
            (out_col_1, counts_df_1[month_col_1]),
            (out_col_2, counts_df_2[month_col_2]),
        )
        for out_col, source in sources:
            merged[out_col] = merged[out_col].fillna(0).astype(source.dtype)

    merged["difference"] = merged[out_col_2] - merged[out_col_1]
    return merged


# Cell-level driver code from the notebook. It is guarded so the functions
# above can be imported without a live database; `engine` is the module-level
# engine created in the connection cell. print() is used because a bare
# expression nested inside an `if` block is not echoed.
if __name__ == "__main__":
    # Connectivity smoke test.
    print(pd.read_sql("SELECT 1;", engine))

    may_df = rentals_month(engine, 5, 2005)
    june_df = rentals_month(engine, 6, 2005)
    print(may_df.head())
    print(june_df.head())

    may_counts = rental_count_month(may_df, 5, 2005)
    june_counts = rental_count_month(june_df, 6, 2005)
    print(may_counts.head())
    print(june_counts.head())
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customer_idrentals_05_2005rentals_06_2005difference
01275
12110
23242
35352
46341
# Line up the May and June per-customer counts; `difference` is June minus May.
comparison = compare_rentals(counts_df_1=may_counts, counts_df_2=june_counts)

# Preview the first rows of the comparison.
comparison.head()
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customer_idfirst_namelast_nameemailrentals_05_2005rentals_06_2005difference
01MARYSMITHMARY.SMITH@sakilacustomer.org275
12PATRICIAJOHNSONPATRICIA.JOHNSON@sakilacustomer.org110
23LINDAWILLIAMSLINDA.WILLIAMS@sakilacustomer.org242
35ELIZABETHBROWNELIZABETH.BROWN@sakilacustomer.org352
46JENNIFERDAVISJENNIFER.DAVIS@sakilacustomer.org341
# Pull the identifying columns for every customer.
customer_info = pd.read_sql(
    sql="SELECT customer_id, first_name, last_name, email FROM customer;",
    con=engine,
)

# Left-merge so every row of `comparison` is kept, then put the identifying
# columns first, followed by the comparison columns in their existing order.
final = pd.merge(comparison, customer_info, how="left", on="customer_id").loc[
    :,
    [
        "customer_id",
        "first_name",
        "last_name",
        "email",
        *(name for name in comparison.columns if name != "customer_id"),
    ],
]

# Preview the first rows of the enriched comparison.
final.head()