diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000..04d89c7 Binary files /dev/null and b/.DS_Store differ diff --git a/lab18-notebook.ipynb b/lab18-notebook.ipynb new file mode 100644 index 0000000..9c22951 --- /dev/null +++ b/lab18-notebook.ipynb @@ -0,0 +1,2113 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "c02a7dec", + "metadata": {}, + "source": [ + "# Lab 18: SQL-Python Connection - Sakila Database Analysis\n", + "\n", + "This lab analyzes customer rental activity between May and June 2005 in the Sakila database." + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "61472f8a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: sqlalchemy in /Users/borjarubiomartin/Desktop/data_analytics/labs/18lab-sql-python-connection/.venv/lib/python3.13/site-packages (2.0.44)\n", + "Requirement already satisfied: greenlet>=1 in /Users/borjarubiomartin/Desktop/data_analytics/labs/18lab-sql-python-connection/.venv/lib/python3.13/site-packages (from sqlalchemy) (3.3.0)\n", + "Requirement already satisfied: typing-extensions>=4.6.0 in /Users/borjarubiomartin/Desktop/data_analytics/labs/18lab-sql-python-connection/.venv/lib/python3.13/site-packages (from sqlalchemy) (4.15.0)\n", + "Requirement already satisfied: greenlet>=1 in /Users/borjarubiomartin/Desktop/data_analytics/labs/18lab-sql-python-connection/.venv/lib/python3.13/site-packages (from sqlalchemy) (3.3.0)\n", + "Requirement already satisfied: typing-extensions>=4.6.0 in /Users/borjarubiomartin/Desktop/data_analytics/labs/18lab-sql-python-connection/.venv/lib/python3.13/site-packages (from sqlalchemy) (4.15.0)\n", + "Requirement already satisfied: pymysql in /Users/borjarubiomartin/Desktop/data_analytics/labs/18lab-sql-python-connection/.venv/lib/python3.13/site-packages (1.1.2)\n", + "Requirement already satisfied: pymysql in 
import subprocess
import sys

# Locations probed for the ``mysql`` command-line client, in priority order.
_DEFAULT_MYSQL_PATHS = (
    '/usr/local/mysql/bin/mysql',  # MySQL Installer
    '/opt/homebrew/bin/mysql',     # Homebrew (Apple Silicon)
    '/usr/local/bin/mysql',        # Homebrew (Intel)
    'mysql',                       # If in PATH
)


def check_mysql_status(mysql_paths=None):
    """Find a usable ``mysql`` command-line client.

    Each candidate is executed with ``--version``; the first one that exits
    with status 0 is reported and returned. This only proves that the client
    binary exists and runs. It never contacts a server, so it says nothing
    about whether a MySQL server is up.

    Parameters:
        mysql_paths: Optional iterable of executables to try, in order.
            Defaults to common macOS install locations followed by a plain
            ``mysql`` PATH lookup.

    Returns:
        The first candidate that ran successfully, or None if none did.
    """
    candidates = _DEFAULT_MYSQL_PATHS if mysql_paths is None else mysql_paths

    for mysql_path in candidates:
        try:
            result = subprocess.run([mysql_path, '--version'],
                                    capture_output=True, text=True, timeout=5)
        except (subprocess.TimeoutExpired, OSError):
            # OSError covers a missing binary (FileNotFoundError) as well as
            # a file that exists but cannot be executed (PermissionError).
            continue

        if result.returncode == 0:
            print("✅ MySQL client found!")
            print(f"Path: {mysql_path}")
            print(f"Version: {result.stdout.strip()}")
            return mysql_path

    print("❌ MySQL client not found in common locations")
    return None


def test_sakila_connection(mysql_path=_DEFAULT_MYSQL_PATHS[0]):
    """Probe the Sakila database through the ``mysql`` client, without a password.

    Parameters:
        mysql_path: Client executable to run. Pass the value returned by
            ``check_mysql_status()`` when the client is not in the default
            MySQL Installer location.

    Returns:
        True when the probe query succeeded, or when the server answered but
        refused the password-less login (the setup is fine, a password is
        simply required). False when the client could not be run or the
        probe failed for any other reason, e.g. the server is unreachable or
        the ``sakila`` schema does not exist.
    """
    query = ('USE sakila; SELECT COUNT(*) as table_count '
             'FROM information_schema.tables WHERE table_schema="sakila";')
    try:
        # Test connection without password (if MySQL allows it)
        result = subprocess.run([mysql_path, '-e', query],
                                capture_output=True, text=True, timeout=10)
    except (subprocess.SubprocessError, OSError) as e:
        print(f"❌ Connection test failed: {e}")
        return False

    if result.returncode == 0 and 'table_count' in result.stdout:
        print("✅ Sakila database found!")
        print("Sample output:", result.stdout.strip())
        return True

    # Only an authentication refusal (MySQL error 1045) means "fine, just
    # needs a password"; any other failure is a real setup problem.
    if 'Access denied' in result.stderr or 'ERROR 1045' in result.stderr:
        print("⚠️ Sakila database test needs authentication")
        print("This is normal - you'll need to provide your MySQL password")
        return True

    detail = result.stderr.strip() or "unexpected output from mysql client"
    print(f"❌ Connection test failed: {detail}")
    return False
print(\"\\nšŸš€ Ready to proceed with the lab!\")\n", + " print(\"Continue to the next cell to establish connection with password.\")\n", + "else:\n", + " print(\"\\nāŒ There might be an issue with your MySQL/Sakila setup.\")" + ] + }, + { + "cell_type": "markdown", + "id": "b9558bff", + "metadata": {}, + "source": [ + "## āœ… MySQL Setup Status\n", + "\n", + "**Great news! MySQL and Sakila are already installed and working on your system.**\n", + "\n", + "### Current Status:\n", + "- āœ… MySQL Server: Installed and accessible\n", + "- āœ… Sakila Database: Available\n", + "- āœ… Python packages: Installed\n", + "\n", + "### Next Steps:\n", + "1. **Continue to the connection cell below** \n", + "2. **Enter your MySQL password** when prompted\n", + "3. **Run the analysis cells** to complete the lab\n", + "\n", + "### Optional: Add MySQL to PATH (for convenience)\n", + "If you want to use `mysql` command directly in Terminal, add this to your `~/.zshrc`:\n", + "```bash\n", + "# Add MySQL to PATH\n", + "export PATH=\"/usr/local/mysql/bin:$PATH\"\n", + "```\n", + "Then run: `source ~/.zshrc`\n", + "\n", + "**You're ready to proceed with the lab! 
import pandas as pd


def create_sakila_connection(password, host='localhost', user='root', database='sakila'):
    """Build a SQLAlchemy engine for the Sakila database (MySQL via PyMySQL).

    The user name and password are percent-encoded before being placed in the
    connection URL, so credentials containing characters such as ``@``, ``/``,
    ``:`` or ``#`` no longer produce a malformed or mis-parsed URL.

    NOTE(review): ``create_engine`` is documented as connecting lazily, so a
    returned engine does not by itself prove the credentials are valid.

    Parameters:
        password: MySQL password, in plain text.
        host: Server host name.
        user: MySQL user name.
        database: Schema to connect to.

    Returns:
        The engine object returned by ``sqlalchemy.create_engine``.
    """
    # Imported here so the function does not depend on which notebook cells
    # have already been executed.
    from urllib.parse import quote
    from sqlalchemy import create_engine

    # safe='' also encodes '/', which would otherwise end the authority part.
    credentials = f"{quote(user, safe='')}:{quote(password, safe='')}"
    connection_string = f'mysql+pymysql://{credentials}@{host}/{database}'
    return create_engine(connection_string)


def rentals_month(engine, month, year):
    """Fetch every row of the ``rental`` table for one calendar month.

    Parameters:
        engine: Database connection engine
        month: Integer representing the month (1-12)
        year: Integer representing the year

    Returns:
        DataFrame with the columns rental_id, rental_date, customer_id,
        inventory_id, staff_id, return_date and last_update for rentals whose
        rental_date falls in the given month and year.
    """
    # Values are passed as bound parameters (driver "pyformat" style), never
    # interpolated into the SQL text.
    query = """
    SELECT rental_id, rental_date, customer_id, inventory_id, staff_id, return_date, last_update
    FROM rental
    WHERE MONTH(rental_date) = %(month)s AND YEAR(rental_date) = %(year)s
    """
    return pd.read_sql(query, engine, params={'month': month, 'year': year})


def rental_count_month(df, month, year):
    """Count rentals per customer in a frame produced by ``rentals_month``.

    ``month`` and ``year`` are used only to name the count column; the rows
    of ``df`` are not filtered by them.

    Parameters:
        df: DataFrame with a ``customer_id`` column, one row per rental
        month: Integer month, zero-padded to two digits in the column name
        year: Integer year used in the column name

    Returns:
        DataFrame with ``customer_id`` and a ``rentals_MM_YYYY`` column
        holding the number of rows per customer.
    """
    column_name = f"rentals_{month:02d}_{year}"

    rental_counts = df.groupby('customer_id').size().reset_index()
    rental_counts.columns = ['customer_id', column_name]
    return rental_counts


def compare_rentals(df1, df2):
    """Combine two per-customer rental-count frames and add their difference.

    The frames are outer-joined on ``customer_id`` so customers present in
    only one of them are kept, with the missing value filled with 0. The outer
    join introduces NaN and thereby turns the counts into floats; columns
    that still hold only whole numbers after filling are cast back to int64,
    so counts and ``difference`` come out as integers.

    Parameters:
        df1: DataFrame with rental counts for the first month
        df2: DataFrame with rental counts for the second month

    Returns:
        DataFrame with customer_id, both rental count columns, and a
        ``difference`` column (second count minus first). ``difference`` is
        only added when at least two non-key columns are present.
    """
    merged_df = pd.merge(df1, df2, on='customer_id', how='outer')

    # Customers who did not rent in one of the months have no row there.
    merged_df = merged_df.fillna(0)

    rental_cols = [col for col in merged_df.columns if col != 'customer_id']

    for col in rental_cols:
        values = merged_df[col]
        # Undo the NaN-induced float upcast, but leave genuinely fractional
        # or non-numeric columns untouched.
        if values.dtype.kind == 'f' and (values % 1 == 0).all():
            merged_df[col] = values.astype('int64')

    if len(rental_cols) >= 2:
        merged_df['difference'] = merged_df[rental_cols[1]] - merged_df[rental_cols[0]]

    return merged_df
7, + "rows": 5 + } + }, + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
rental_idrental_datecustomer_idinventory_idstaff_idreturn_datelast_update
012005-05-24 22:53:3013036712005-05-26 22:04:302006-02-15 21:30:53
122005-05-24 22:54:33459152512005-05-28 19:40:332006-02-15 21:30:53
232005-05-24 23:03:39408171112005-06-01 22:12:392006-02-15 21:30:53
342005-05-24 23:04:41333245222005-06-03 01:43:412006-02-15 21:30:53
452005-05-24 23:05:21222207912005-06-02 04:33:212006-02-15 21:30:53
\n", + "
" + ], + "text/plain": [ + " rental_id rental_date customer_id inventory_id staff_id \\\n", + "0 1 2005-05-24 22:53:30 130 367 1 \n", + "1 2 2005-05-24 22:54:33 459 1525 1 \n", + "2 3 2005-05-24 23:03:39 408 1711 1 \n", + "3 4 2005-05-24 23:04:41 333 2452 2 \n", + "4 5 2005-05-24 23:05:21 222 2079 1 \n", + "\n", + " return_date last_update \n", + "0 2005-05-26 22:04:30 2006-02-15 21:30:53 \n", + "1 2005-05-28 19:40:33 2006-02-15 21:30:53 \n", + "2 2005-06-01 22:12:39 2006-02-15 21:30:53 \n", + "3 2005-06-03 01:43:41 2006-02-15 21:30:53 \n", + "4 2005-06-02 04:33:21 2006-02-15 21:30:53 " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Analysis: Get rental data for May 2005\n", + "may_rentals = rentals_month(engine, 5, 2005)\n", + "print(f\"Total rentals in May 2005: {len(may_rentals)}\")\n", + "print(\"\\nFirst 5 rows of May rental data:\")\n", + "display(may_rentals.head())" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "008ff95e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total rentals in June 2005: 2311\n", + "\n", + "First 5 rows of June rental data:\n" + ] + }, + { + "data": { + "application/vnd.microsoft.datawrangler.viewer.v0+json": { + "columns": [ + { + "name": "index", + "rawType": "int64", + "type": "integer" + }, + { + "name": "rental_id", + "rawType": "int64", + "type": "integer" + }, + { + "name": "rental_date", + "rawType": "datetime64[ns]", + "type": "datetime" + }, + { + "name": "customer_id", + "rawType": "int64", + "type": "integer" + }, + { + "name": "inventory_id", + "rawType": "int64", + "type": "integer" + }, + { + "name": "staff_id", + "rawType": "int64", + "type": "integer" + }, + { + "name": "return_date", + "rawType": "datetime64[ns]", + "type": "datetime" + }, + { + "name": "last_update", + "rawType": "datetime64[ns]", + "type": "datetime" + } + ], + "ref": "a8d9f2da-1940-49f5-8be4-97a72ffeb81a", + "rows": [ + [ + "0", + 
"1158", + "2005-06-14 22:53:33", + "416", + "1632", + "2", + "2005-06-18 21:37:33", + "2006-02-15 21:30:53" + ], + [ + "1", + "1159", + "2005-06-14 22:55:13", + "516", + "4395", + "1", + "2005-06-17 02:11:13", + "2006-02-15 21:30:53" + ], + [ + "2", + "1160", + "2005-06-14 23:00:34", + "239", + "2795", + "2", + "2005-06-18 01:58:34", + "2006-02-15 21:30:53" + ], + [ + "3", + "1161", + "2005-06-14 23:07:08", + "285", + "1690", + "1", + "2005-06-21 17:12:08", + "2006-02-15 21:30:53" + ], + [ + "4", + "1162", + "2005-06-14 23:09:38", + "310", + "987", + "1", + "2005-06-23 22:00:38", + "2006-02-15 21:30:53" + ] + ], + "shape": { + "columns": 7, + "rows": 5 + } + }, + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
rental_idrental_datecustomer_idinventory_idstaff_idreturn_datelast_update
011582005-06-14 22:53:33416163222005-06-18 21:37:332006-02-15 21:30:53
111592005-06-14 22:55:13516439512005-06-17 02:11:132006-02-15 21:30:53
211602005-06-14 23:00:34239279522005-06-18 01:58:342006-02-15 21:30:53
311612005-06-14 23:07:08285169012005-06-21 17:12:082006-02-15 21:30:53
411622005-06-14 23:09:3831098712005-06-23 22:00:382006-02-15 21:30:53
\n", + "
" + ], + "text/plain": [ + " rental_id rental_date customer_id inventory_id staff_id \\\n", + "0 1158 2005-06-14 22:53:33 416 1632 2 \n", + "1 1159 2005-06-14 22:55:13 516 4395 1 \n", + "2 1160 2005-06-14 23:00:34 239 2795 2 \n", + "3 1161 2005-06-14 23:07:08 285 1690 1 \n", + "4 1162 2005-06-14 23:09:38 310 987 1 \n", + "\n", + " return_date last_update \n", + "0 2005-06-18 21:37:33 2006-02-15 21:30:53 \n", + "1 2005-06-17 02:11:13 2006-02-15 21:30:53 \n", + "2 2005-06-18 01:58:34 2006-02-15 21:30:53 \n", + "3 2005-06-21 17:12:08 2006-02-15 21:30:53 \n", + "4 2005-06-23 22:00:38 2006-02-15 21:30:53 " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Analysis: Get rental data for June 2005\n", + "june_rentals = rentals_month(engine, 6, 2005)\n", + "print(f\"Total rentals in June 2005: {len(june_rentals)}\")\n", + "print(\"\\nFirst 5 rows of June rental data:\")\n", + "display(june_rentals.head())" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "38a29783", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of customers who rented in May 2005: 520\n", + "Number of customers who rented in June 2005: 590\n", + "\n", + "Top 5 customers in May:\n" + ] + }, + { + "data": { + "application/vnd.microsoft.datawrangler.viewer.v0+json": { + "columns": [ + { + "name": "index", + "rawType": "int64", + "type": "integer" + }, + { + "name": "customer_id", + "rawType": "int64", + "type": "integer" + }, + { + "name": "rentals_05_2005", + "rawType": "int64", + "type": "integer" + } + ], + "ref": "e2059d70-fff1-43a0-a49e-cae808364444", + "rows": [ + [ + "168", + "197", + "8" + ], + [ + "92", + "109", + "7" + ], + [ + "441", + "506", + "7" + ], + [ + "15", + "19", + "6" + ], + [ + "43", + "53", + "6" + ] + ], + "shape": { + "columns": 2, + "rows": 5 + } + }, + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customer_idrentals_05_2005
1681978
921097
4415067
15196
43536
\n", + "
" + ], + "text/plain": [ + " customer_id rentals_05_2005\n", + "168 197 8\n", + "92 109 7\n", + "441 506 7\n", + "15 19 6\n", + "43 53 6" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Top 5 customers in June:\n" + ] + }, + { + "data": { + "application/vnd.microsoft.datawrangler.viewer.v0+json": { + "columns": [ + { + "name": "index", + "rawType": "int64", + "type": "integer" + }, + { + "name": "customer_id", + "rawType": "int64", + "type": "integer" + }, + { + "name": "rentals_06_2005", + "rawType": "int64", + "type": "integer" + } + ], + "ref": "b251ba73-dd85-4905-9dcf-0171acf69e25", + "rows": [ + [ + "30", + "31", + "11" + ], + [ + "445", + "454", + "10" + ], + [ + "209", + "213", + "9" + ], + [ + "263", + "267", + "9" + ], + [ + "291", + "295", + "9" + ] + ], + "shape": { + "columns": 2, + "rows": 5 + } + }, + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customer_idrentals_06_2005
303111
44545410
2092139
2632679
2912959
\n", + "
" + ], + "text/plain": [ + " customer_id rentals_06_2005\n", + "30 31 11\n", + "445 454 10\n", + "209 213 9\n", + "263 267 9\n", + "291 295 9" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Get rental counts per customer for each month\n", + "may_counts = rental_count_month(may_rentals, 5, 2005)\n", + "june_counts = rental_count_month(june_rentals, 6, 2005)\n", + "\n", + "print(f\"Number of customers who rented in May 2005: {len(may_counts)}\")\n", + "print(f\"Number of customers who rented in June 2005: {len(june_counts)}\")\n", + "\n", + "print(\"\\nTop 5 customers in May:\")\n", + "display(may_counts.nlargest(5, 'rentals_05_2005'))\n", + "\n", + "print(\"\\nTop 5 customers in June:\")\n", + "display(june_counts.nlargest(5, 'rentals_06_2005'))" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "5e8120d8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total customers active in either month: 598\n", + "Customers active in BOTH May and June: 512\n", + "\n", + "Comparison DataFrame (first 10 rows):\n" + ] + }, + { + "data": { + "application/vnd.microsoft.datawrangler.viewer.v0+json": { + "columns": [ + { + "name": "index", + "rawType": "int64", + "type": "integer" + }, + { + "name": "customer_id", + "rawType": "int64", + "type": "integer" + }, + { + "name": "rentals_05_2005", + "rawType": "float64", + "type": "float" + }, + { + "name": "rentals_06_2005", + "rawType": "float64", + "type": "float" + }, + { + "name": "difference", + "rawType": "float64", + "type": "float" + } + ], + "ref": "e784c49d-37c6-43ef-84a7-b695032c4b62", + "rows": [ + [ + "0", + "1", + "2.0", + "7.0", + "5.0" + ], + [ + "1", + "2", + "1.0", + "1.0", + "0.0" + ], + [ + "2", + "3", + "2.0", + "4.0", + "2.0" + ], + [ + "3", + "4", + "0.0", + "6.0", + "6.0" + ], + [ + "4", + "5", + "3.0", + "5.0", + "2.0" + ], + [ + "5", + "6", + "3.0", + "4.0", + "1.0" + ], + [ + "6", + "7", + "5.0", + 
"5.0", + "0.0" + ], + [ + "7", + "8", + "1.0", + "3.0", + "2.0" + ], + [ + "8", + "9", + "3.0", + "2.0", + "-1.0" + ], + [ + "9", + "10", + "1.0", + "5.0", + "4.0" + ] + ], + "shape": { + "columns": 4, + "rows": 10 + } + }, + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customer_idrentals_05_2005rentals_06_2005difference
012.07.05.0
121.01.00.0
232.04.02.0
340.06.06.0
453.05.02.0
563.04.01.0
675.05.00.0
781.03.02.0
893.02.0-1.0
9101.05.04.0
\n", + "
" + ], + "text/plain": [ + " customer_id rentals_05_2005 rentals_06_2005 difference\n", + "0 1 2.0 7.0 5.0\n", + "1 2 1.0 1.0 0.0\n", + "2 3 2.0 4.0 2.0\n", + "3 4 0.0 6.0 6.0\n", + "4 5 3.0 5.0 2.0\n", + "5 6 3.0 4.0 1.0\n", + "6 7 5.0 5.0 0.0\n", + "7 8 1.0 3.0 2.0\n", + "8 9 3.0 2.0 -1.0\n", + "9 10 1.0 5.0 4.0" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Compare the two months\n", + "comparison = compare_rentals(may_counts, june_counts)\n", + "print(f\"Total customers active in either month: {len(comparison)}\")\n", + "\n", + "# Show customers who were active in both months\n", + "active_both_months = comparison[(comparison['rentals_05_2005'] > 0) & (comparison['rentals_06_2005'] > 0)]\n", + "print(f\"Customers active in BOTH May and June: {len(active_both_months)}\")\n", + "\n", + "print(\"\\nComparison DataFrame (first 10 rows):\")\n", + "display(comparison.head(10))" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "1e7c0276", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Top 10 customers by total activity:\n" + ] + }, + { + "data": { + "application/vnd.microsoft.datawrangler.viewer.v0+json": { + "columns": [ + { + "name": "index", + "rawType": "int64", + "type": "integer" + }, + { + "name": "customer_id", + "rawType": "int64", + "type": "integer" + }, + { + "name": "rentals_05_2005", + "rawType": "float64", + "type": "float" + }, + { + "name": "rentals_06_2005", + "rawType": "float64", + "type": "float" + }, + { + "name": "difference", + "rawType": "float64", + "type": "float" + }, + { + "name": "total_rentals", + "rawType": "float64", + "type": "float" + } + ], + "ref": "02474e76-96f4-4f37-92cd-65e6127edc7e", + "rows": [ + [ + "195", + "197", + "8.0", + "8.0", + "0.0", + "16.0" + ], + [ + "175", + "176", + "5.0", + "8.0", + "3.0", + "13.0" + ], + [ + "369", + "371", + "6.0", + "7.0", + "1.0", + "13.0" + ], + [ + "108", + "109", + "7.0", + 
"5.0", + "-2.0", + "12.0" + ], + [ + "194", + "196", + "4.0", + "8.0", + "4.0", + "12.0" + ], + [ + "254", + "256", + "5.0", + "7.0", + "2.0", + "12.0" + ], + [ + "265", + "267", + "3.0", + "9.0", + "6.0", + "12.0" + ], + [ + "504", + "506", + "7.0", + "5.0", + "-2.0", + "12.0" + ], + [ + "524", + "526", + "3.0", + "9.0", + "6.0", + "12.0" + ], + [ + "30", + "31", + "0.0", + "11.0", + "11.0", + "11.0" + ] + ], + "shape": { + "columns": 5, + "rows": 10 + } + }, + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customer_idrentals_05_2005rentals_06_2005differencetotal_rentals
1951978.08.00.016.0
1751765.08.03.013.0
3693716.07.01.013.0
1081097.05.0-2.012.0
1941964.08.04.012.0
2542565.07.02.012.0
2652673.09.06.012.0
5045067.05.0-2.012.0
5245263.09.06.012.0
30310.011.011.011.0
\n", + "
" + ], + "text/plain": [ + " customer_id rentals_05_2005 rentals_06_2005 difference total_rentals\n", + "195 197 8.0 8.0 0.0 16.0\n", + "175 176 5.0 8.0 3.0 13.0\n", + "369 371 6.0 7.0 1.0 13.0\n", + "108 109 7.0 5.0 -2.0 12.0\n", + "194 196 4.0 8.0 4.0 12.0\n", + "254 256 5.0 7.0 2.0 12.0\n", + "265 267 3.0 9.0 6.0 12.0\n", + "504 506 7.0 5.0 -2.0 12.0\n", + "524 526 3.0 9.0 6.0 12.0\n", + "30 31 0.0 11.0 11.0 11.0" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Statistical analysis\n", + "comparison['total_rentals'] = comparison['rentals_05_2005'] + comparison['rentals_06_2005']\n", + "\n", + "print(\"Top 10 customers by total activity:\")\n", + "top_customers = comparison.nlargest(10, 'total_rentals')[['customer_id', 'rentals_05_2005', 'rentals_06_2005', 'difference', 'total_rentals']]\n", + "display(top_customers)" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "b5bb1eb1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Top 10 customers with biggest increase from May to June:\n" + ] + }, + { + "data": { + "application/vnd.microsoft.datawrangler.viewer.v0+json": { + "columns": [ + { + "name": "index", + "rawType": "int64", + "type": "integer" + }, + { + "name": "customer_id", + "rawType": "int64", + "type": "integer" + }, + { + "name": "rentals_05_2005", + "rawType": "float64", + "type": "float" + }, + { + "name": "rentals_06_2005", + "rawType": "float64", + "type": "float" + }, + { + "name": "difference", + "rawType": "float64", + "type": "float" + } + ], + "ref": "dec4a1d9-a832-4a69-bb1e-0a0d446836e2", + "rows": [ + [ + "30", + "31", + "0.0", + "11.0", + "11.0" + ], + [ + "327", + "329", + "0.0", + "9.0", + "9.0" + ], + [ + "452", + "454", + "1.0", + "10.0", + "9.0" + ], + [ + "177", + "178", + "0.0", + "8.0", + "8.0" + ], + [ + "211", + "213", + "1.0", + "9.0", + "8.0" + ], + [ + "266", + "268", + "0.0", + "8.0", + "8.0" + ], + [ + "293", + "295", 
+ "1.0", + "9.0", + "8.0" + ], + [ + "334", + "336", + "0.0", + "8.0", + "8.0" + ], + [ + "338", + "340", + "0.0", + "8.0", + "8.0" + ], + [ + "455", + "457", + "1.0", + "9.0", + "8.0" + ] + ], + "shape": { + "columns": 4, + "rows": 10 + } + }, + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customer_idrentals_05_2005rentals_06_2005difference
30310.011.011.0
3273290.09.09.0
4524541.010.09.0
1771780.08.08.0
2112131.09.08.0
2662680.08.08.0
2932951.09.08.0
3343360.08.08.0
3383400.08.08.0
4554571.09.08.0
\n", + "
" + ], + "text/plain": [ + " customer_id rentals_05_2005 rentals_06_2005 difference\n", + "30 31 0.0 11.0 11.0\n", + "327 329 0.0 9.0 9.0\n", + "452 454 1.0 10.0 9.0\n", + "177 178 0.0 8.0 8.0\n", + "211 213 1.0 9.0 8.0\n", + "266 268 0.0 8.0 8.0\n", + "293 295 1.0 9.0 8.0\n", + "334 336 0.0 8.0 8.0\n", + "338 340 0.0 8.0 8.0\n", + "455 457 1.0 9.0 8.0" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Customers with biggest increase from May to June\n", + "print(\"Top 10 customers with biggest increase from May to June:\")\n", + "biggest_increase = comparison.nlargest(10, 'difference')[['customer_id', 'rentals_05_2005', 'rentals_06_2005', 'difference']]\n", + "display(biggest_increase)" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "b2017504", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Top 10 customers with biggest decrease from May to June:\n" + ] + }, + { + "data": { + "application/vnd.microsoft.datawrangler.viewer.v0+json": { + "columns": [ + { + "name": "index", + "rawType": "int64", + "type": "integer" + }, + { + "name": "customer_id", + "rawType": "int64", + "type": "integer" + }, + { + "name": "rentals_05_2005", + "rawType": "float64", + "type": "float" + }, + { + "name": "rentals_06_2005", + "rawType": "float64", + "type": "float" + }, + { + "name": "difference", + "rawType": "float64", + "type": "float" + } + ], + "ref": "3e378ba6-8c50-42f9-ad99-caf3da230262", + "rows": [ + [ + "205", + "207", + "6.0", + "1.0", + "-5.0" + ], + [ + "13", + "14", + "5.0", + "1.0", + "-4.0" + ], + [ + "160", + "161", + "6.0", + "2.0", + "-4.0" + ], + [ + "196", + "198", + "5.0", + "1.0", + "-4.0" + ], + [ + "248", + "250", + "5.0", + "1.0", + "-4.0" + ], + [ + "272", + "274", + "6.0", + "2.0", + "-4.0" + ], + [ + "594", + "596", + "6.0", + "2.0", + "-4.0" + ], + [ + "18", + "19", + "6.0", + "3.0", + "-3.0" + ], + [ + "123", + "124", + "4.0", + "1.0", + "-3.0" + 
], + [ + "220", + "222", + "5.0", + "2.0", + "-3.0" + ] + ], + "shape": { + "columns": 4, + "rows": 10 + } + }, + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
customer_idrentals_05_2005rentals_06_2005difference
2052076.01.0-5.0
13145.01.0-4.0
1601616.02.0-4.0
1961985.01.0-4.0
2482505.01.0-4.0
2722746.02.0-4.0
5945966.02.0-4.0
18196.03.0-3.0
1231244.01.0-3.0
2202225.02.0-3.0
\n", + "
" + ], + "text/plain": [ + " customer_id rentals_05_2005 rentals_06_2005 difference\n", + "205 207 6.0 1.0 -5.0\n", + "13 14 5.0 1.0 -4.0\n", + "160 161 6.0 2.0 -4.0\n", + "196 198 5.0 1.0 -4.0\n", + "248 250 5.0 1.0 -4.0\n", + "272 274 6.0 2.0 -4.0\n", + "594 596 6.0 2.0 -4.0\n", + "18 19 6.0 3.0 -3.0\n", + "123 124 4.0 1.0 -3.0\n", + "220 222 5.0 2.0 -3.0" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Customers with biggest decrease from May to June\n", + "print(\"Top 10 customers with biggest decrease from May to June:\")\n", + "biggest_decrease = comparison.nsmallest(10, 'difference')[['customer_id', 'rentals_05_2005', 'rentals_06_2005', 'difference']]\n", + "display(biggest_decrease)" + ] + }, + { + "cell_type": "markdown", + "id": "8dbfc444", + "metadata": {}, + "source": [ + "## Summary\n", + "\n", + "This analysis shows:\n", + "1. The total number of rentals in each month\n", + "2. Which customers were active in both months\n", + "3. How customer activity changed between May and June 2005\n", + "4. The customers with the biggest increases and decreases in rental activity" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}