diff --git a/lab-sql-python-connection.ipynb b/lab-sql-python-connection.ipynb new file mode 100644 index 0000000..0f1233b --- /dev/null +++ b/lab-sql-python-connection.ipynb @@ -0,0 +1,236 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "ffa2666e-22a6-4710-a3b7-afbecebb0090", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Database connection engine created successfully.\n", + "\n", + "--- Starting Data Retrieval and Analysis ---\n", + "1. Raw rental data retrieved. May: 1156 records. June: 2311 records.\n", + "2. Rental counts calculated. Customers in May: 520. Customers in June: 590.\n", + "\n", + "--- FINAL CUSTOMER ACTIVITY REPORT (May vs. June) ---\n", + "Total customers active in both months: 512\n", + "\n", + "Top 10 Customers by Rental Increase (June - May):\n", + " customer_id rentals_05_2005 rentals_06_2005 difference\n", + "452 454 1.0 10.0 9.0\n", + "455 457 1.0 9.0 8.0\n", + "211 213 1.0 9.0 8.0\n", + "293 295 1.0 9.0 8.0\n", + "26 27 1.0 8.0 7.0\n", + "232 234 1.0 8.0 7.0\n", + "258 260 1.0 8.0 7.0\n", + "378 380 1.0 8.0 7.0\n", + "559 561 2.0 9.0 7.0\n", + "404 406 1.0 7.0 6.0\n", + "\n", + "Bottom 5 Customers by Rental Decrease (June - May):\n", + " customer_id rentals_05_2005 rentals_06_2005 difference\n", + "196 198 5.0 1.0 -4.0\n", + "272 274 6.0 2.0 -4.0\n", + "248 250 5.0 1.0 -4.0\n", + "594 596 6.0 2.0 -4.0\n", + "205 207 6.0 1.0 -5.0\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "from sqlalchemy import create_engine\n", + "from typing import Literal\n", + "\n", + "# --- 1. 
import os
from getpass import getpass

from sqlalchemy.engine import URL, Engine

# --- 1. DATABASE CONNECTION CONFIGURATION ---
# Connection settings are read from the environment so that no credential is
# stored in the notebook. DB_USER, DB_HOST and DB_NAME fall back to the local
# Sakila defaults; the password is prompted for when DB_PASSWORD is not set.
DB_USER = os.environ.get('DB_USER', 'root')
DB_PASSWORD = os.environ.get('DB_PASSWORD') or getpass('MySQL password: ')
DB_HOST = os.environ.get('DB_HOST', 'localhost')
DB_NAME = os.environ.get('DB_NAME', 'sakila')

# Create the SQLAlchemy engine for database connection
try:
    # URL.create() escapes special characters in the credentials (e.g. '@',
    # '/' or ':'), which a hand-formatted connection string does not.
    engine = create_engine(
        URL.create(
            'mysql+pymysql',
            username=DB_USER,
            password=DB_PASSWORD,
            host=DB_HOST,
            database=DB_NAME,
        )
    )
    print("Database connection engine created successfully.")
except ImportError:
    print("ERROR: Required libraries 'sqlalchemy', 'pandas', or 'pymysql' might be missing.")
    print("Please run: pip install sqlalchemy pandas pymysql")
    # exit() is a helper meant for the interactive shell and reports success;
    # raise SystemExit with a non-zero status so the failure is visible.
    raise SystemExit(1)

# ======================================================================
# 1. FUNCTION: rentals_month
# ======================================================================

def rentals_month(engine: Engine, month: int, year: int) -> pd.DataFrame:
    """
    Retrieves rental data for a specific month and year from the Sakila 'rental' table.

    Args:
        engine: The SQLAlchemy engine object for the database connection.
        month: The month (integer) to filter the data.
        year: The year (integer) to filter the data.

    Returns:
        A pandas DataFrame containing the filtered rental data (rental_id, customer_id, rental_date).

    Raises:
        ValueError: If 'month' or 'year' cannot be converted to an integer.
    """
    # Imported locally because the file-level import only brings in create_engine.
    from sqlalchemy import text

    # The month and year are sent as bound parameters instead of being
    # formatted into the SQL text, so the statement itself never changes.
    query = text(
        """
        SELECT
            rental_id,
            customer_id,
            rental_date
        FROM
            rental
        WHERE
            YEAR(rental_date) = :year AND MONTH(rental_date) = :month
        """
    )

    # int() normalises integer-like values (e.g. NumPy integers) to plain
    # Python ints before they are handed to the database driver.
    bound_values = {'year': int(year), 'month': int(month)}

    # Read the data directly into a Pandas DataFrame
    return pd.read_sql(query, engine, params=bound_values)
# ======================================================================
# 2. FUNCTION: rental_count_month
# ======================================================================

def rental_count_month(df_rentals: pd.DataFrame, month: int, year: int) -> pd.DataFrame:
    """
    Counts the rentals per customer and labels the count column with the month and year.

    Args:
        df_rentals: DataFrame of rental records with 'customer_id' and 'rental_id' columns.
        month: The month (integer); used only to build the count column name.
        year: The year (integer); used only to build the count column name.

    Returns:
        A new DataFrame with 'customer_id' and a count column named 'rentals_MM_YYYY'.
    """
    # Build the label up front, e.g. month=5, year=2005 -> 'rentals_05_2005'.
    count_label = "rentals_" + str(month).zfill(2) + "_" + str(year)

    # One row per customer: the number of rental_id values recorded for them.
    per_customer = df_rentals.groupby('customer_id')['rental_id'].count()

    # Turn the customer_id index back into a column and name the counts.
    return per_customer.reset_index(name=count_label)
# ======================================================================
# 3. FUNCTION: compare_rentals
# ======================================================================

def compare_rentals(df1: pd.DataFrame, df2: pd.DataFrame) -> pd.DataFrame:
    """
    Compares the rental counts of two months per customer.

    Args:
        df1: DataFrame containing rental counts for the first month (e.g., May).
            'customer_id' must be the first column and the count the second.
        df2: DataFrame containing rental counts for the second month (e.g., June),
            laid out the same way.

    Returns:
        A combined DataFrame with an added 'difference' column (df2_rentals - df1_rentals).
        Customers missing from one month get a count of 0 for that month. Integer
        count columns stay integer. If both inputs use the same count column name,
        the combined columns carry the '_x' (df1) and '_y' (df2) suffixes.
    """
    # Identify the rental count column names dynamically (the second column in each DataFrame)
    first_col = df1.columns[1]
    second_col = df2.columns[1]

    # Remember the input dtypes: the outer join introduces NaN for missing
    # customers, which silently turns integer counts into floats.
    first_dtype = df1[first_col].dtype
    second_dtype = df2[second_col].dtype

    # Merge the two DataFrames on 'customer_id' using an 'outer' join to keep all customers
    df_combined = pd.merge(
        df1, df2, on='customer_id', how='outer', suffixes=('_x', '_y')
    )

    # When both inputs share a count column name the merge renames them with
    # the suffixes above; follow the rename so the lookups below do not fail.
    if first_col == second_col:
        first_col, second_col = f"{first_col}_x", f"{second_col}_y"

    # Fill NaN values with 0, as a NaN count means the customer made 0 rentals in that month.
    df_combined = df_combined.fillna(0)

    # Restore integer dtypes now that no NaN is left in the count columns.
    for column, dtype in ((first_col, first_dtype), (second_col, second_dtype)):
        if pd.api.types.is_integer_dtype(dtype):
            df_combined[column] = df_combined[column].astype(dtype)

    # Calculate the 'difference' column (Month 2 - Month 1)
    df_combined['difference'] = df_combined[second_col] - df_combined[first_col]

    return df_combined

# ======================================================================
# 4. SCRIPT EXECUTION
# ======================================================================

# The guard keeps the database queries from running when this code is
# imported; in a notebook or when run as a script __name__ is "__main__",
# so the report is produced exactly as before.
if __name__ == "__main__":
    # Define the months of interest (May and June 2005)
    MAY_MONTH = 5
    MAY_YEAR = 2005
    JUNE_MONTH = 6
    JUNE_YEAR = 2005

    print("\n--- Starting Data Retrieval and Analysis ---")

    # Step 1: Retrieve raw data for May and June
    df_may_raw = rentals_month(engine, MAY_MONTH, MAY_YEAR)
    df_jun_raw = rentals_month(engine, JUNE_MONTH, JUNE_YEAR)
    print(f"1. Raw rental data retrieved. May: {len(df_may_raw)} records. June: {len(df_jun_raw)} records.")

    # Step 2: Count rentals per customer for each month
    df_may_count = rental_count_month(df_may_raw, MAY_MONTH, MAY_YEAR)
    df_jun_count = rental_count_month(df_jun_raw, JUNE_MONTH, JUNE_YEAR)
    print(f"2. Rental counts calculated. Customers in May: {len(df_may_count)}. Customers in June: {len(df_jun_count)}.")

    # Step 3: Compare and calculate the difference
    df_final_report = compare_rentals(df_may_count, df_jun_count)

    # Filter for customers who were active in BOTH May AND June
    may_col_name = f"rentals_{str(MAY_MONTH).zfill(2)}_{MAY_YEAR}"
    jun_col_name = f"rentals_{str(JUNE_MONTH).zfill(2)}_{JUNE_YEAR}"

    df_active_both = df_final_report[
        (df_final_report[may_col_name] > 0) &
        (df_final_report[jun_col_name] > 0)
    ].sort_values(by='difference', ascending=False)

    print("\n--- FINAL CUSTOMER ACTIVITY REPORT (May vs. June) ---")
    print(f"Total customers active in both months: {len(df_active_both)}")
    print("\nTop 10 Customers by Rental Increase (June - May):")
    print(df_active_both.head(10))
    print("\nBottom 5 Customers by Rental Decrease (June - May):")
    # tail() returns the last five rows, i.e. the largest decreases.
    print(df_active_both.tail())