Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
236 changes: 236 additions & 0 deletions lab-sql-python-connection.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,236 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "ffa2666e-22a6-4710-a3b7-afbecebb0090",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Database connection engine created successfully.\n",
"\n",
"--- Starting Data Retrieval and Analysis ---\n",
"1. Raw rental data retrieved. May: 1156 records. June: 2311 records.\n",
"2. Rental counts calculated. Customers in May: 520. Customers in June: 590.\n",
"\n",
"--- FINAL CUSTOMER ACTIVITY REPORT (May vs. June) ---\n",
"Total customers active in both months: 512\n",
"\n",
"Top 10 Customers by Rental Increase (June - May):\n",
" customer_id rentals_05_2005 rentals_06_2005 difference\n",
"452 454 1.0 10.0 9.0\n",
"455 457 1.0 9.0 8.0\n",
"211 213 1.0 9.0 8.0\n",
"293 295 1.0 9.0 8.0\n",
"26 27 1.0 8.0 7.0\n",
"232 234 1.0 8.0 7.0\n",
"258 260 1.0 8.0 7.0\n",
"378 380 1.0 8.0 7.0\n",
"559 561 2.0 9.0 7.0\n",
"404 406 1.0 7.0 6.0\n",
"\n",
"Bottom 5 Customers by Rental Decrease (June - May):\n",
" customer_id rentals_05_2005 rentals_06_2005 difference\n",
"196 198 5.0 1.0 -4.0\n",
"272 274 6.0 2.0 -4.0\n",
"248 250 5.0 1.0 -4.0\n",
"594 596 6.0 2.0 -4.0\n",
"205 207 6.0 1.0 -5.0\n"
]
}
],
"source": [
"import pandas as pd\n",
"from sqlalchemy import create_engine\n",
"from typing import Literal\n",
"\n",
"# --- 1. DATABASE CONNECTION CONFIGURATION ---\n",
"# IMPORTANT: Set DB_USER, DB_PASSWORD, and DB_HOST below to your own MySQL credentials.\n",
"DB_USER = 'root'\n",
"DB_PASSWORD = '5884123695476'\n",
"DB_HOST = 'localhost'\n",
"DB_NAME = 'sakila'\n",
"\n",
"# Create the SQLAlchemy engine (lazy: no database connection is opened until the first query runs)\n",
"try:\n",
" engine = create_engine(f'mysql+pymysql://{DB_USER}:{DB_PASSWORD}@{DB_HOST}/{DB_NAME}')\n",
" print(\"Database connection engine created successfully.\")\n",
"except ImportError:\n",
" print(\"ERROR: Required libraries 'sqlalchemy', 'pandas', or 'pymysql' might be missing.\")\n",
" print(\"Please run: pip install sqlalchemy pandas pymysql\")\n",
" exit()\n",
"\n",
"# ======================================================================\n",
"# 1. FUNCTION: rentals_month\n",
"# ======================================================================\n",
"\n",
"def rentals_month(engine: create_engine, month: int, year: int) -> pd.DataFrame:\n",
" \"\"\"\n",
" Retrieves rental data for a specific month and year from the Sakila 'rental' table.\n",
"\n",
" Args:\n",
" engine: The SQLAlchemy engine object for the database connection.\n",
" month: The month (integer) to filter the data.\n",
" year: The year (integer) to filter the data.\n",
"\n",
" Returns:\n",
" A pandas DataFrame containing the filtered rental data (rental_id, customer_id, rental_date).\n",
" \"\"\"\n",
" # SQL query uses MONTH() and YEAR() functions to filter the rental_date column.\n",
" query = f\"\"\"\n",
" SELECT\n",
" rental_id,\n",
" customer_id,\n",
" rental_date\n",
" FROM\n",
" rental\n",
" WHERE\n",
" YEAR(rental_date) = {year} AND MONTH(rental_date) = {month};\n",
" \"\"\"\n",
"\n",
" # Read the data directly into a Pandas DataFrame\n",
" df_rentals = pd.read_sql(query, engine)\n",
" \n",
" return df_rentals\n",
"\n",
"# ======================================================================\n",
"# 2. FUNCTION: rental_count_month\n",
"# ======================================================================\n",
"\n",
"def rental_count_month(df_rentals: pd.DataFrame, month: int, year: int) -> pd.DataFrame:\n",
" \"\"\"\n",
" Counts the rentals per customer_id in df_rentals, which is expected to be already filtered to the selected month and year.\n",
"\n",
" Args:\n",
" df_rentals: DataFrame containing rental records (output from rentals_month).\n",
" month: The month (integer); used only to build the output column name.\n",
" year: The year (integer); used only to build the output column name.\n",
"\n",
" Returns:\n",
" A new DataFrame with 'customer_id' and the rental count column, named 'rentals_MM_YYYY'.\n",
" \"\"\"\n",
" \n",
" # Group the data by 'customer_id' and count the number of rentals (rental_id)\n",
" df_count = df_rentals.groupby('customer_id')['rental_id'].count().reset_index()\n",
" \n",
" # Format the month/year for the new column name (e.g., 5 -> '05')\n",
" month_str = str(month).zfill(2)\n",
" year_str = str(year)\n",
" new_column_name = f\"rentals_{month_str}_{year_str}\"\n",
" \n",
" # Rename the rental_id count column to the required format\n",
" df_count = df_count.rename(columns={'rental_id': new_column_name})\n",
" \n",
" return df_count\n",
"\n",
"# ======================================================================\n",
"# 3. FUNCTION: compare_rentals\n",
"# ======================================================================\n",
"\n",
"def compare_rentals(df1: pd.DataFrame, df2: pd.DataFrame) -> pd.DataFrame:\n",
" \"\"\"\n",
" Compares the rental counts of two months per customer.\n",
"\n",
" Args:\n",
" df1: DataFrame containing rental counts for the first month (e.g., May).\n",
" df2: DataFrame containing rental counts for the second month (e.g., June).\n",
"\n",
" Returns:\n",
" A combined DataFrame with an added 'difference' column (df2_rentals - df1_rentals).\n",
" \"\"\"\n",
" \n",
" # Identify the rental count column names dynamically (the second column in each DataFrame)\n",
" col1 = df1.columns[1]\n",
" col2 = df2.columns[1]\n",
"\n",
" # Merge the two DataFrames on 'customer_id' using an 'outer' join to keep all customers\n",
" df_combined = pd.merge(df1, df2, on='customer_id', how='outer')\n",
" \n",
" # Fill NaN values with 0, as a NaN count means the customer made 0 rentals in that month.\n",
" df_combined = df_combined.fillna(0)\n",
" \n",
" # Calculate the 'difference' column (Month 2 - Month 1)\n",
" df_combined['difference'] = df_combined[col2] - df_combined[col1]\n",
" \n",
" return df_combined\n",
"\n",
"# ======================================================================\n",
"# 4. SCRIPT EXECUTION\n",
"# ======================================================================\n",
"\n",
"# Define the months of interest (May and June 2005)\n",
"MAY_MONTH = 5\n",
"MAY_YEAR = 2005\n",
"JUNE_MONTH = 6\n",
"JUNE_YEAR = 2005\n",
"\n",
"print(\"\\n--- Starting Data Retrieval and Analysis ---\")\n",
"\n",
"# Step 1: Retrieve raw data for May and June\n",
"df_may_raw = rentals_month(engine, MAY_MONTH, MAY_YEAR)\n",
"df_jun_raw = rentals_month(engine, JUNE_MONTH, JUNE_YEAR)\n",
"print(f\"1. Raw rental data retrieved. May: {len(df_may_raw)} records. June: {len(df_jun_raw)} records.\")\n",
"\n",
"\n",
"# Step 2: Count rentals per customer for each month\n",
"df_may_count = rental_count_month(df_may_raw, MAY_MONTH, MAY_YEAR)\n",
"df_jun_count = rental_count_month(df_jun_raw, JUNE_MONTH, JUNE_YEAR)\n",
"print(f\"2. Rental counts calculated. Customers in May: {len(df_may_count)}. Customers in June: {len(df_jun_count)}.\")\n",
"\n",
"\n",
"# Step 3: Compare and calculate the difference\n",
"df_final_report = compare_rentals(df_may_count, df_jun_count)\n",
"\n",
"\n",
"# Filter for customers who were active in BOTH May AND June\n",
"may_col_name = f\"rentals_{str(MAY_MONTH).zfill(2)}_{MAY_YEAR}\"\n",
"jun_col_name = f\"rentals_{str(JUNE_MONTH).zfill(2)}_{JUNE_YEAR}\"\n",
"\n",
"df_active_both = df_final_report[\n",
" (df_final_report[may_col_name] > 0) & \n",
" (df_final_report[jun_col_name] > 0)\n",
"].sort_values(by='difference', ascending=False)\n",
"\n",
"\n",
"print(\"\\n--- FINAL CUSTOMER ACTIVITY REPORT (May vs. June) ---\")\n",
"print(f\"Total customers active in both months: {len(df_active_both)}\")\n",
"print(\"\\nTop 10 Customers by Rental Increase (June - May):\")\n",
"print(df_active_both.head(10))\n",
"print(\"\\nBottom 5 Customers by Rental Decrease (June - May):\")\n",
"print(df_active_both.tail())"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "71e21186-790d-418f-ae92-5ba09fb035fa",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python [conda env:base] *",
"language": "python",
"name": "conda-base-py"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}