diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 0000000..de288e1
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,3 @@
+{
+    "python.formatting.provider": "black"
+}
\ No newline at end of file
diff --git a/src/activation_functions.py b/src/activation_functions.py
new file mode 100644
index 0000000..119551a
--- /dev/null
+++ b/src/activation_functions.py
@@ -0,0 +1,156 @@
+"""
+Activation functions for the neural net
+"""
+
+import numpy as np
+
+from loss_functions import CrossEntropy
+
+
+class ActivationLayer:
+    """
+    Represents an activation function
+    Arguments:
+    - size = number of neurons in the layer
+    - next_step = the next step in the neural net
+    Notation: input is Z, output is A
+    """
+
+    name = ...
+    equation = ...
+
+    def __init__(self, size, next_step):
+        self.size = size
+        self.next = next_step
+
+    def deriv(self, Z):
+        ...
+
+    def deriv_loss(self, DA, dZ):
+        """
+        returns the derivative matrix of Loss with respect to Z, the input to the activation
+        - override this if there is a mathematical shortcut, or if the derivative
+          is not calculated element-wise
+        - element-wise multiplication by default, because apply() is usually element-wise
+        DA = self.next.deriv_loss(self.apply(Z))
+        dZ = self.deriv(Z)
+        """
+        print(f"{self.name} - deriv_loss")
+        print(f"{dZ = }")
+        return DA * dZ
+
+
+class Softmax(ActivationLayer):
+    """
+    softmax activation function
+    Softmax(Z)[i] = e^Z[i] / sum_i(e^Z)
+    Y_hat = Softmax(Z)
+    """
+
+    name = "Softmax"
+    equation = "e^Z[i] / sum_i(e^Z)"
+
+    def apply(self, Z: np.ndarray):
+        """
+        returns the softmax array of Z
+        called Y_hat since it is only used at the termination of the neural net
+        """
+        max_Z = np.amax(Z, axis=0).reshape(1, Z.shape[1])  # column-wise maximum
+        eZ = np.exp(Z - max_Z)  # subtracting the max keeps exp() numerically stable
+        return eZ / eZ.sum(axis=0, keepdims=True)
+
+    def deriv(self, Y_hat):
+        """
+        returns a derivative matrix of how each value in Z affects
+        each value in A
+        A = self.apply(Z)
+        """
+        """
+        dY_hat/dZ2:
+        dY_hat/dZ2.shape should be {n,n,m}
+        from (10) Y_hat{n,m} = the estimate of Y = softmax(Z2{n,m}):
+        given some i,j in range(n) and k,l in range(m):
+            Y_hat[i,k] changes with respect to Z2[j,l] only when k == l
+            for simplicity, assume k == l and thus drop those terms
+            dY_hat[i]/dZ2 has dimension {n}
+            dY_hat[i]/dZ2[j] =
+                if i == j --> softmax(Z2[j])*(1-softmax(Z2[j]))
+                if i != j --> -softmax(Z2[i])*softmax(Z2[j])
+            dY_hat/dZ2 has dimension {n,n} for each entry in m
+            dY_hat/dZ2[i,j,k] =
+                if i == j --> softmax(Z2[j,k])*(1-softmax(Z2[j,k]))
+                if i != j --> -softmax(Z2[i,k])*softmax(Z2[j,k])
+        for simplicity, call p[i, ...] = softmax(Z2[i, ...]). Thus:
+        (13) dY_hat/dZ2[i,j,k]{n,n,m} =
+            if i == j --> p[j,k]*(1-p[j,k])
+            if i != j --> -p[i,k]*p[j,k]
+        """
+        softmax = Y_hat  # shape {n,m}: classes on axis 0, examples on axis 1
+        identity = np.eye(softmax.shape[0])
+        # t1[i,j,k] = p[i,k]*p[j,k] -- the i != j part of (13)
+        t1 = np.einsum("ik,jk->ijk", softmax, softmax)
+        # t2[i,j,k] = p[i,k]*delta(i,j) -- adds the extra term when i == j
+        t2 = np.einsum("ik,ij->ijk", softmax, identity)
+        return t2 - t1
+
+    def deriv_loss(self, Y_hat):
+        """
+        returns the derivative matrix of Loss with respect to Z, the input to softmax
+        - currently only supports loss == CrossEntropy
+        Y_hat = self.apply(Z)
+        """
+        if isinstance(self.next, CrossEntropy):
+            """
+            DZ2 = dL/dZ2:
+            DZ2.shape should be {n,m}
+            DZ2 = dL/dY_hat * dY_hat/dZ2
+            for now, drop m, so L has dim 1 while Z2 has dim {n}
+            let i,j in range(n)
+            from (13) dY_hat/dZ2[i,j,k]{n,n,m} =
+                if i == j --> p[j,k]*(1-p[j,k])
+                if i != j --> -p[i,k]*p[j,k]:
+            dL/dZ2[j] = sum over i of dL/dY_hat[i] * dY_hat[i]/dZ2[j]
+                = {when i == j} - Y[j]/Y_hat[j] * Y_hat[j]*(1-Y_hat[j])
+                    + {sum over i when i != j of} (- (Y[i] / Y_hat[i]) * -Y_hat[i]*Y_hat[j] )
+                = -Y[j] * (1 - Y_hat[j]) + Y_hat[j] * {sum over i when i != j of} Y[i]
+                = -Y[j] + Y[j] * Y_hat[j] + Y_hat[j] * (-Y[j] + {sum over i of} Y[i])  # folded Y[j] back into the summation
+                = -Y[j] + Y[j] * Y_hat[j] + Y_hat[j] * (1 - Y[j])  # NOTE: {sum over i of} Y[i] = 1 since
+                                                                   # Y[i] = 0 for all but 1 i, where it equals 1
+                = -Y[j] + Y_hat[j]
+            Adding back in k in range(m):
+            dL/dZ2[j,k] = -Y[j,k] + Y_hat[j,k]
+            (14) DZ2{n,m} = -Y + Y_hat
+            """
+            print(f"{self.name} - deriv_loss")
+            return -self.next.Y + Y_hat
+
+        raise NotImplementedError(
+            "currently only implemented as the last layer's activation "
+            + "function, with CrossEntropy as the loss function"
+        )
+
+
+class ReLU(ActivationLayer):
+    """
+    Rectified Linear Unit activation function
+    ReLU(Z)[i] = max(Z[i], 0)
+    """
+
+    name = "Rectified Linear Unit"
+    equation = "max(Z[i], 0)"
+
+    def apply(self, Z: np.ndarray):
+        """rectified linear unit activation function"""
+        return np.maximum(Z, 0)
+
+    def deriv(self, Z):
+        """
+        returns a derivative matrix of how each value in Z affects
+        each value in A
+        """
+        """
+        dA/dZ[i] = 1 if Z[i] > 0, else 0
+        """
+        return Z > 0
diff --git a/src/loss_functions.py b/src/loss_functions.py
new file mode 100644
index 0000000..47b6f84
--- /dev/null
+++ b/src/loss_functions.py
@@ -0,0 +1,55 @@
+"""
+Loss functions for neural net
+"""
+
+import numpy as np
+
+
+class Loss:
+    """
+    Loss function
+    evaluates distance between prediction (Y_hat) and actual value (Y)
+    """
+
+    name = ...
+    equation = ...
+    out_shape = (1,)  # the loss is a scalar
+
+    def __init__(self, Y):
+        self.Y = np.array(Y)
+
+    def loss(self, Y_hat):
+        ...
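(Not part of the diff: a quick standalone sanity check of the two formulas above, before the CrossEntropy subclass they rely on. The `softmax` and `ce_sum` helpers and the toy shapes here are hypothetical, chosen only for this check; the einsum lines mirror `Softmax.deriv`, and the final assert exercises the `(14) DZ2 = -Y + Y_hat` shortcut that `Softmax.deriv_loss` returns.)

```python
import numpy as np

def softmax(Z):
    # columns are examples, rows are classes, matching Softmax.apply above
    eZ = np.exp(Z - Z.max(axis=0, keepdims=True))
    return eZ / eZ.sum(axis=0, keepdims=True)

rng = np.random.default_rng(0)
n, m = 4, 3
Z = rng.normal(size=(n, m))
Y = np.eye(n)[:, rng.integers(n, size=m)]  # one-hot label columns, shape (n, m)

p = softmax(Z)
# Jacobian from (13): J[i,j,k] = p[i,k] * (delta_ij - p[j,k])
J = np.einsum("ik,ij->ijk", p, np.eye(n)) - np.einsum("ik,jk->ijk", p, p)

# finite-difference check of one entry: perturb Z[j,k], watch Y_hat[i,k]
eps = 1e-6
i, j, k = 1, 2, 0
Zp = Z.copy()
Zp[j, k] += eps
numeric = (softmax(Zp)[i, k] - p[i, k]) / eps
assert abs(J[i, j, k] - numeric) < 1e-5

def ce_sum(Z):
    # cross-entropy summed over the m examples; the 1/m is applied later,
    # when the weight gradients are averaged (see weights.py below)
    return -np.log(np.sum(softmax(Z) * Y, axis=0)).sum()

# shortcut (14): d(ce_sum)/dZ = Y_hat - Y
numeric_dZ = (ce_sum(Zp) - ce_sum(Z)) / eps
assert abs((p - Y)[j, k] - numeric_dZ) < 1e-5
print("Jacobian (13) and shortcut (14) agree with finite differences")
```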
+
+
+class CrossEntropy(Loss):
+    """
+    Y = actual values
+    Y_hat = estimate of Y
+
+    L(Y, Y_hat) = - sum_over_i(Y[i] * log(Y_hat[i]))
+    """
+
+    name = "Cross Entropy"
+    equation = "- sum_over_i(Y[i] * log(Y_hat[i]))"
+
+    def loss(self, Y_hat):
+        Y_hat_clipped = np.clip(Y_hat, 1e-7, 1)  # avoid log(0) when an estimate is 0
+        targeted_Y_hat = np.sum(Y_hat_clipped * self.Y, axis=0)
+        return np.mean(-np.log(targeted_Y_hat))
+
+    def deriv(self, Y_hat):
+        """
+        returns how self.loss changes with regard to a change in each value in Y_hat
+
+        dL/dY_hat{n,m} = -Y / Y_hat
+        (element-wise; the 1/m from the mean over examples is applied later,
+        when the weight gradients are averaged)
+        """
+        return -self.Y / Y_hat
+
+    def deriv_loss(self, Y_hat):
+        """
+        since the loss is the end of the chain,
+        deriv_loss == deriv
+        """
+        return self.deriv(Y_hat)
diff --git a/src/lukestest.ipynb b/src/lukestest.ipynb
index f776b8e..5c17503 100644
--- a/src/lukestest.ipynb
+++ b/src/lukestest.ipynb
@@ -1,5 +1,1213 @@
 {
  "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "10"
+      ]
+     },
+     "execution_count": 18,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "\"\"\"import our data\"\"\"\n",
+    "\n",
+    "import numpy as np\n",
+    "import os\n",
+    "from typing import Tuple\n",
+    "import pandas as pd\n",
+    "\n",
+    "# DATA FROM HERE: https://pjreddie.com/projects/mnist-in-csv/\n",
+    "file_test = '../data/MNIST/mnist_test.csv'\n",
+    "file_train = '../data/MNIST/mnist_train.csv'\n",
+    "\n",
+    "\n",
+    "def get_data_from_csv(file: str) -> Tuple[np.array, int, int]:\n",
+    "    \"\"\"takes data from file (csv type) and returns\n",
+    "    a shuffled version of the data in an np.array form,\n",
+    "    along with two ints:\n",
+    "    m - number of test examples\n",
+    "    n - number of points per example (including integrated labels)\"\"\"\n",
+    "    assert os.path.exists(file), f\"{file} does not exist\"\n",
+    "\n",
+    "    data = pd.read_csv(file)\n",
+    "    m, n = data.shape\n",
+    "    data = np.array(data)\n",
+    "    np.random.shuffle(data)\n",
+    "\n",
+    "    return (data, m, n)\n",
+    "\n",
+    "\n",
+    "def get_labels_and_data_1st_column(data: np.array) -> Tuple[np.array, np.array]:\n",
+    "    \"\"\"takes an np.array of data, returns (transposed) labels (Y) and data (X)\"\"\"\n",
+    "    data = data.T\n",
+    "    Y = data[0]\n",
+    "    X = data[1:]/255.\n",
+    "    return (Y, X)\n",
+    "\n",
+    "\n",
+    "data_test, m_test, n_test = get_data_from_csv(file_test)\n",
+    "Y_test, X_test = get_labels_and_data_1st_column(data_test)\n",
+    "\n",
+    "data_train, m_train, n_train = get_data_from_csv(file_train)\n",
+    "Y_train, X_train = get_labels_and_data_1st_column(data_train)\n",
+    "\n",
+    "\n",
+    "\"\"\"making sure that our Y_test/Y_train are actually labels\"\"\"\n",
+    "\n",
+    "assert Y_test.max() == 9\n",
+    "assert Y_train.max() == 9\n",
+    "assert X_test[0].max() != 9\n",
+    "assert X_train[0].max() != 9\n",
+    "\n",
+    "\n",
+    "def one_hot_encode(Y: np.array, classes=10):\n",
+    "    \"\"\"transforms an array into one-hot encodings:\n",
+    "    [0,3,2] -> [ [1,0,0,0], [0,0,0,1], [0,0,1,0] ]\n",
+    "    Assumes labels are ints in range(classes).\"\"\"\n",
+    "    # instantiate zeros in an array of shape (Y.size, classes)\n",
+    "    one_hot = np.zeros((Y.size,classes))\n",
+    "    one_hot[np.arange(Y.size), Y] = 1\n",
+    "    one_hot = one_hot.T\n",
+    "    return one_hot\n",
+    "\n",
+    "Y = one_hot_encode(Y_train)\n",
+    "X = X_train\n",
+    "\n",
+    "m = X.shape[1] # number of examples\n",
+    "x = X.shape[0] # number of datapoints per sample\n",
+    
"n = Y.shape[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Y.shape = (10, 500)\n", + "X.shape = (5, 500)\n", + "2.343816072341479\n", + "[[0.833 1.087 0.986 ... 0.966 1.091 0.586]\n", + " [0. 0. 0. ... 0. 0. 0. ]\n", + " [0.094 0. 0. ... 0.033 0. 0. ]\n", + " ...\n", + " [0.107 0.037 0.251 ... 0.191 0.26 0.413]\n", + " [0.494 0.448 0.627 ... 0.687 0.25 0.998]\n", + " [0. 0. 0. ... 0. 0.145 0. ]]\n", + "[[0. 0. 0. ... 0. 0. 0. ]\n", + " [0.27 0.354 0.348 ... 0.245 0.406 0.193]\n", + " [0.597 0.577 0.464 ... 0.52 0.716 0.459]\n", + " ...\n", + " [0. 0. 0. ... 0. 0. 0. ]\n", + " [0.091 0. 0.104 ... 0.135 0. 0.379]\n", + " [0.358 0.482 0.58 ... 0.542 0.565 0.413]]\n", + "[[0.076 0.081 0.081 ... 0.078 0.082 0.071]\n", + " [0.162 0.171 0.183 ... 0.179 0.175 0.168]\n", + " [0.082 0.081 0.087 ... 0.085 0.077 0.095]\n", + " ...\n", + " [0.112 0.106 0.1 ... 0.104 0.108 0.111]\n", + " [0.056 0.058 0.063 ... 0.061 0.055 0.059]\n", + " [0.091 0.091 0.092 ... 0.09 0.086 0.089]]\n", + "[[0. 1. 0. ... 0. 0. 0.]\n", + " [1. 0. 1. ... 0. 0. 0.]\n", + " [0. 0. 0. ... 0. 0. 0.]\n", + " ...\n", + " [0. 0. 0. ... 0. 0. 0.]\n", + " [0. 0. 0. ... 0. 0. 0.]\n", + " [0. 0. 0. ... 0. 0. 0.]]\n", + "Softmax - deriv_loss\n", + "Y_hat = array([[0.076, 0.081, 0.081, ..., 0.078, 0.082, 0.071],\n", + " [0.162, 0.171, 0.183, ..., 0.179, 0.175, 0.168],\n", + " [0.082, 0.081, 0.087, ..., 0.085, 0.077, 0.095],\n", + " ...,\n", + " [0.112, 0.106, 0.1 , ..., 0.104, 0.108, 0.111],\n", + " [0.056, 0.058, 0.063, ..., 0.061, 0.055, 0.059],\n", + " [0.091, 0.091, 0.092, ..., 0.09 , 0.086, 0.089]])\n", + "DZ = array([[ 0.076, -0.919, 0.081, ..., 0.078, 0.082, 0.071],\n", + " [-0.838, 0.171, -0.817, ..., 0.179, 0.175, 0.168],\n", + " [ 0.082, 0.081, 0.087, ..., 0.085, 0.077, 0.095],\n", + " ...,\n", + " [ 0.112, 0.106, 0.1 , ..., 0.104, 0.108, 0.111],\n", + " [ 0.056, 0.058, 0.063, ..., 0.061, 0.055, 0.059],\n", + " [ 0.091, 0.091, 0.092, ..., 0.09 , 0.086, 0.089]])\n", + "layer_index = 2, type(layer) = \n", + "A = array([[0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0.27 , 0.354, 0.348, ..., 0.245, 0.406, 0.193],\n", + " [0.597, 0.577, 0.464, ..., 0.52 , 0.716, 0.459],\n", + " ...,\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0.091, 0. , 0.104, ..., 0.135, 0. 
, 0.379],\n", + " [0.358, 0.482, 0.58 , ..., 0.542, 0.565, 0.413]])\n", + "DA = array([[-0.129, -0.396, -0.127, ..., 0.128, -0.113, -0.339],\n", + " [ 0.052, -0.177, 0.053, ..., -0.237, 0.224, 0.12 ],\n", + " [ 0.195, 0.061, 0.175, ..., -0.167, -0.365, -0.092],\n", + " ...,\n", + " [ 0.086, -0.513, 0.095, ..., 0.352, -0.269, -0.001],\n", + " [ 0.077, 0.217, 0.07 , ..., -0.082, 0.198, -0.271],\n", + " [-0.452, -0.239, -0.431, ..., 0.144, -0.125, 0.36 ]])\n", + "DW = array([[ 0.000e+00, -2.602e+00, -4.379e+00, 0.000e+00, 0.000e+00,\n", + " 0.000e+00, 3.437e-02, 0.000e+00, -2.008e+00, -4.323e+00],\n", + " [ 0.000e+00, 1.310e+01, 2.270e+01, 0.000e+00, 0.000e+00,\n", + " 0.000e+00, 3.405e-01, 0.000e+00, 3.567e+00, 1.989e+01],\n", + " [ 0.000e+00, -6.190e+00, -1.158e+01, 0.000e+00, 0.000e+00,\n", + " 0.000e+00, -3.162e-01, 0.000e+00, -1.957e+00, -9.821e+00],\n", + " [ 0.000e+00, -5.096e+00, -1.112e+01, 0.000e+00, 0.000e+00,\n", + " 0.000e+00, -2.141e-02, 0.000e+00, -1.188e+00, -6.844e+00],\n", + " [ 0.000e+00, 5.151e+00, 9.424e+00, 0.000e+00, 0.000e+00,\n", + " 0.000e+00, -9.267e-02, 0.000e+00, 1.602e+00, 7.448e+00],\n", + " [ 0.000e+00, 4.618e+00, 7.254e+00, 0.000e+00, 0.000e+00,\n", + " 0.000e+00, 1.374e-01, 0.000e+00, 2.018e+00, 7.913e+00],\n", + " [ 0.000e+00, -8.604e-01, -7.637e-01, 0.000e+00, 0.000e+00,\n", + " 0.000e+00, 7.507e-02, 0.000e+00, 2.865e-01, -6.106e-01],\n", + " [ 0.000e+00, -3.509e-01, 1.105e+00, 0.000e+00, 0.000e+00,\n", + " 0.000e+00, -8.222e-02, 0.000e+00, 4.728e-01, 6.141e-01],\n", + " [ 0.000e+00, -2.843e+00, -5.151e+00, 0.000e+00, 0.000e+00,\n", + " 0.000e+00, 8.677e-02, 0.000e+00, -1.007e+00, -5.568e+00],\n", + " [ 0.000e+00, -4.928e+00, -7.495e+00, 0.000e+00, 0.000e+00,\n", + " 0.000e+00, -1.616e-01, 0.000e+00, -1.785e+00, -8.703e+00]])\n", + "Db = array([[ -8.972],\n", + " [ 39.68 ],\n", + " [-19.809],\n", + " [-16.289],\n", + " [ 15.535],\n", + " [ 14.169],\n", + " [ -1.446],\n", + " [ 1.428],\n", + " [ -8.903],\n", + " [-15.392]])\n", + "layer_index = 3, type(layer) = \n", + "Z = array([[-0.443, -0.437, -0.338, ..., -0.316, -0.48 , -0.214],\n", + " [ 0.27 , 0.354, 0.348, ..., 0.245, 0.406, 0.193],\n", + " [ 0.597, 0.577, 0.464, ..., 0.52 , 0.716, 0.459],\n", + " ...,\n", + " [-0.567, -0.697, -0.627, ..., -0.696, -0.662, -0.43 ],\n", + " [ 0.091, -0.004, 0.104, ..., 0.135, -0.115, 0.379],\n", + " [ 0.358, 0.482, 0.58 , ..., 0.542, 0.565, 0.413]])\n", + "calculating DZ\n", + "Rectified Linear Unit - deriv_loss\n", + "dZ = array([[False, False, False, ..., False, False, False],\n", + " [ True, True, True, ..., True, True, True],\n", + " [ True, True, True, ..., True, True, True],\n", + " ...,\n", + " [False, False, False, ..., False, False, False],\n", + " [ True, False, True, ..., True, False, True],\n", + " [ True, True, True, ..., True, True, True]])\n", + "DZ = array([[-0. , -0. , -0. , ..., 0. , -0. , -0. ],\n", + " [ 0.052, -0.177, 0.053, ..., -0.237, 0.224, 0.12 ],\n", + " [ 0.195, 0.061, 0.175, ..., -0.167, -0.365, -0.092],\n", + " ...,\n", + " [ 0. , -0. , 0. , ..., 0. , -0. , -0. ],\n", + " [ 0.077, 0. , 0.07 , ..., -0.082, 0. , -0.271],\n", + " [-0.452, -0.239, -0.431, ..., 0.144, -0.125, 0.36 ]])\n", + "layer_index = 4, type(layer) = \n", + "A = array([[0.833, 1.087, 0.986, ..., 0.966, 1.091, 0.586],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0.094, 0. , 0. , ..., 0.033, 0. , 0. ],\n", + " ...,\n", + " [0.107, 0.037, 0.251, ..., 0.191, 0.26 , 0.413],\n", + " [0.494, 0.448, 0.627, ..., 0.687, 0.25 , 0.998],\n", + " [0. , 0. , 0. , ..., 0. 
, 0.145, 0. ]])\n", + "DA = array([[ 0.038, -0.116, -0.008, ..., -0.09 , 0.044, 0.169],\n", + " [-0.028, 0.149, 0.082, ..., -0.016, -0.19 , -0.117],\n", + " [ 0.067, 0.124, 0.079, ..., 0.041, -0.088, -0.139],\n", + " ...,\n", + " [-0.046, 0.075, 0.095, ..., -0.124, -0.101, 0.082],\n", + " [ 0.308, 0.006, 0.139, ..., -0.034, -0. , -0.249],\n", + " [ 0.053, -0.026, -0.038, ..., 0.027, -0.24 , -0.113]])\n", + "DW = array([[ 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,\n", + " 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00],\n", + " [-3.066e+00, 7.596e-03, 7.429e-03, 0.000e+00, 1.129e-01,\n", + " -3.375e-01, -1.485e+00, -5.278e-01, -2.397e+00, 2.650e-01],\n", + " [ 1.113e+01, 3.053e-02, 4.569e-02, 0.000e+00, 3.886e-01,\n", + " 2.365e+00, 5.791e+00, 3.871e+00, 8.694e+00, 2.107e-01],\n", + " [ 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,\n", + " 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00],\n", + " [ 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,\n", + " 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00],\n", + " [ 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,\n", + " 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00],\n", + " [-3.410e+00, 0.000e+00, -4.596e-02, 0.000e+00, -7.219e-03,\n", + " 1.166e-01, -5.660e-01, -2.559e-01, -2.526e+00, -9.460e-02],\n", + " [ 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,\n", + " 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00],\n", + " [ 1.208e+00, 9.739e-03, -1.089e-01, 0.000e+00, -1.161e-01,\n", + " 3.035e-01, 1.293e+00, 1.082e-01, 1.335e+00, -1.447e-01],\n", + " [ 7.104e+00, -6.681e-02, 2.333e-01, 0.000e+00, -1.813e-01,\n", + " 6.735e-01, -1.334e-01, -1.315e-01, 1.985e+00, -2.371e-01]])\n", + "Db = array([[ 0. ],\n", + " [-3.042],\n", + " [11.148],\n", + " [ 0. ],\n", + " [ 0. ],\n", + " [ 0. ],\n", + " [-4.406],\n", + " [ 0. ],\n", + " [ 1.221],\n", + " [ 6.493]])\n", + "layer_index = 5, type(layer) = \n", + "Z = array([[ 0.833, 1.087, 0.986, ..., 0.966, 1.091, 0.586],\n", + " [-0.396, -0.41 , -0.676, ..., -0.776, -0.633, -0.397],\n", + " [ 0.094, -0.045, -0.151, ..., 0.033, -0.019, -0.181],\n", + " ...,\n", + " [ 0.107, 0.037, 0.251, ..., 0.191, 0.26 , 0.413],\n", + " [ 0.494, 0.448, 0.627, ..., 0.687, 0.25 , 0.998],\n", + " [-0.024, -0.337, -0.219, ..., -0.284, 0.145, -0.26 ]])\n", + "calculating DZ\n", + "Rectified Linear Unit - deriv_loss\n", + "dZ = array([[ True, True, True, ..., True, True, True],\n", + " [False, False, False, ..., False, False, False],\n", + " [ True, False, False, ..., True, False, False],\n", + " ...,\n", + " [ True, True, True, ..., True, True, True],\n", + " [ True, True, True, ..., True, True, True],\n", + " [False, False, False, ..., False, True, False]])\n", + "DZ = array([[ 0.038, -0.116, -0.008, ..., -0.09 , 0.044, 0.169],\n", + " [-0. , 0. , 0. , ..., -0. , -0. , -0. ],\n", + " [ 0.067, 0. , 0. , ..., 0.041, -0. , -0. ],\n", + " ...,\n", + " [-0.046, 0.075, 0.095, ..., -0.124, -0.101, 0.082],\n", + " [ 0.308, 0.006, 0.139, ..., -0.034, -0. , -0.249],\n", + " [ 0. , -0. , -0. , ..., 0. , -0.24 , -0. 
]])\n", + "layer_index = 6, type(layer) = \n", + "A = array([[0.51 , 0.547, 0.484, ..., 0.961, 0.219, 0.642],\n", + " [0.426, 0.609, 0.388, ..., 0.356, 0.738, 0.189],\n", + " [0.078, 0.052, 0.799, ..., 0.351, 0.931, 0.367],\n", + " [0.01 , 0.289, 0.596, ..., 0.079, 0.232, 0.788],\n", + " [0.379, 0.876, 0.756, ..., 0.657, 0.705, 0.009]])\n", + "DA = array([[ 0.125, -0.021, -0.115, ..., 0.071, 0.087, -0.02 ],\n", + " [-0.117, -0.006, 0.015, ..., -0.067, -0.173, 0.128],\n", + " [-0.016, -0.011, -0.086, ..., 0.023, -0.098, 0.084],\n", + " [ 0.079, -0.009, -0.043, ..., 0.009, 0.2 , -0.018],\n", + " [-0.032, -0.083, -0.156, ..., 0.078, 0.15 , 0.095]])\n", + "DW = array([[ 0.105, 0.492, -0.066, -1.046, -0.738],\n", + " [ 0.01 , 0.154, 0.01 , 0.143, 0.034],\n", + " [-0.226, -0.296, -0.111, -0.165, -0.364],\n", + " [ 0. , 0. , 0. , 0. , 0. ],\n", + " [ 0.807, 1.033, 0.775, 1.211, 0.249],\n", + " [ 1.581, 0.881, -0.179, -0.173, 2.601],\n", + " [-3.297, -1.656, -2.533, -2.257, -0.84 ],\n", + " [ 3.512, 2.72 , 2.964, 3.378, 2.988],\n", + " [-0.446, -1.322, -0.458, 0.045, -1.621],\n", + " [-0.308, -0.465, -0.651, -0.403, -0.344]])\n", + "Db = array([[-0.515],\n", + " [ 0.18 ],\n", + " [-0.944],\n", + " [ 0. ],\n", + " [ 1.578],\n", + " [ 2.625],\n", + " [-2.491],\n", + " [ 5.879],\n", + " [-2.945],\n", + " [-1.429]])\n", + "14.119451790239486\n", + "[[1.38 2.197 2.494 ... 1.795 2.045 1.795]\n", + " [0. 0. 0. ... 0. 0. 0. ]\n", + " [1.427 1.575 1.479 ... 1.59 1.591 1.138]\n", + " ...\n", + " [0. 0. 0. ... 0. 0. 0. ]\n", + " [4.879 5.873 5.865 ... 5.753 5.827 4.625]\n", + " [1.945 1.994 2.559 ... 2.092 2.926 2.013]]\n", + "[[1.255e+00 1.439e+00 1.983e+00 ... 1.828e+00 1.922e+00 1.765e+00]\n", + " [2.489e+01 3.212e+01 3.630e+01 ... 3.228e+01 3.263e+01 3.001e+01]\n", + " [0.000e+00 0.000e+00 0.000e+00 ... 0.000e+00 0.000e+00 0.000e+00]\n", + " ...\n", + " [1.171e-01 2.276e-02 3.176e-01 ... 3.893e-01 3.276e-01 4.249e-01]\n", + " [0.000e+00 0.000e+00 0.000e+00 ... 0.000e+00 0.000e+00 0.000e+00]\n", + " [0.000e+00 0.000e+00 0.000e+00 ... 0.000e+00 0.000e+00 0.000e+00]]\n", + "[[1.159e-048 5.276e-061 3.066e-067 ... 1.433e-060 2.533e-061 3.216e-056]\n", + " [8.705e-251 2.685e-315 0.000e+000 ... 1.890e-315 4.238e-319 4.620e-294]\n", + " [1.000e+000 1.000e+000 1.000e+000 ... 1.000e+000 1.000e+000 1.000e+000]\n", + " ...\n", + " [3.906e-076 3.084e-095 8.970e-106 ... 2.514e-095 2.855e-096 5.366e-089]\n", + " [1.591e-057 4.628e-072 7.847e-080 ... 8.512e-072 5.219e-073 1.191e-066]\n", + " [4.286e-024 4.237e-030 7.514e-033 ... 1.199e-029 3.684e-030 2.544e-027]]\n", + "[[0. 1. 0. ... 0. 0. 0.]\n", + " [1. 0. 1. ... 0. 0. 0.]\n", + " [0. 0. 0. ... 0. 0. 0.]\n", + " ...\n", + " [0. 0. 0. ... 0. 0. 0.]\n", + " [0. 0. 0. ... 0. 0. 0.]\n", + " [0. 0. 0. ... 0. 0. 
0.]]\n", + "Softmax - deriv_loss\n", + "Y_hat = array([[1.159e-048, 5.276e-061, 3.066e-067, ..., 1.433e-060, 2.533e-061,\n", + " 3.216e-056],\n", + " [8.705e-251, 2.685e-315, 0.000e+000, ..., 1.890e-315, 4.238e-319,\n", + " 4.620e-294],\n", + " [1.000e+000, 1.000e+000, 1.000e+000, ..., 1.000e+000, 1.000e+000,\n", + " 1.000e+000],\n", + " ...,\n", + " [3.906e-076, 3.084e-095, 8.970e-106, ..., 2.514e-095, 2.855e-096,\n", + " 5.366e-089],\n", + " [1.591e-057, 4.628e-072, 7.847e-080, ..., 8.512e-072, 5.219e-073,\n", + " 1.191e-066],\n", + " [4.286e-024, 4.237e-030, 7.514e-033, ..., 1.199e-029, 3.684e-030,\n", + " 2.544e-027]])\n", + "DZ = array([[ 1.159e-048, -1.000e+000, 3.066e-067, ..., 1.433e-060,\n", + " 2.533e-061, 3.216e-056],\n", + " [-1.000e+000, 2.685e-315, -1.000e+000, ..., 1.890e-315,\n", + " 4.238e-319, 4.620e-294],\n", + " [ 1.000e+000, 1.000e+000, 1.000e+000, ..., 1.000e+000,\n", + " 1.000e+000, 1.000e+000],\n", + " ...,\n", + " [ 3.906e-076, 3.084e-095, 8.970e-106, ..., 2.514e-095,\n", + " 2.855e-096, 5.366e-089],\n", + " [ 1.591e-057, 4.628e-072, 7.847e-080, ..., 8.512e-072,\n", + " 5.219e-073, 1.191e-066],\n", + " [ 4.286e-024, 4.237e-030, 7.514e-033, ..., 1.199e-029,\n", + " 3.684e-030, 2.544e-027]])\n", + "layer_index = 2, type(layer) = \n", + "A = array([[1.255e+00, 1.439e+00, 1.983e+00, ..., 1.828e+00, 1.922e+00,\n", + " 1.765e+00],\n", + " [2.489e+01, 3.212e+01, 3.630e+01, ..., 3.228e+01, 3.263e+01,\n", + " 3.001e+01],\n", + " [0.000e+00, 0.000e+00, 0.000e+00, ..., 0.000e+00, 0.000e+00,\n", + " 0.000e+00],\n", + " ...,\n", + " [1.171e-01, 2.276e-02, 3.176e-01, ..., 3.893e-01, 3.276e-01,\n", + " 4.249e-01],\n", + " [0.000e+00, 0.000e+00, 0.000e+00, ..., 0.000e+00, 0.000e+00,\n", + " 0.000e+00],\n", + " [0.000e+00, 0.000e+00, 0.000e+00, ..., 0.000e+00, 0.000e+00,\n", + " 0.000e+00]])\n", + "DA = array([[-0.509, -0.778, -0.509, ..., -0.253, -0.492, -0.719],\n", + " [19.512, 3.579, 19.512, ..., 5.262, 1.486, 11.096],\n", + " [34.099, 6.886, 34.099, ..., 10.282, -0.285, 18.375],\n", + " ...,\n", + " [ 0.474, -0.129, 0.474, ..., 0.733, 0.12 , 0.38 ],\n", + " [ 6.006, 0.579, 6.006, ..., 2.572, 1.383, 4.098],\n", + " [29.237, 5.222, 29.237, ..., 9.313, 2.812, 18.066]])\n", + "DW = array([[-8.780e+001, -1.575e+003, 0.000e+000, 2.774e-069, -7.408e+001,\n", + " -8.529e+001, -1.476e+003, -1.607e+001, 0.000e+000, 0.000e+000],\n", + " [-8.726e+001, -1.528e+003, 0.000e+000, 0.000e+000, -7.537e+001,\n", + " -8.497e+001, -1.427e+003, -1.732e+001, 0.000e+000, 0.000e+000],\n", + " [ 7.865e+002, 1.412e+004, 0.000e+000, 6.962e-002, 6.385e+002,\n", + " 7.447e+002, 1.334e+004, 1.420e+002, 0.000e+000, 0.000e+000],\n", + " [-9.299e+001, -1.661e+003, 0.000e+000, 2.265e-030, -7.296e+001,\n", + " -8.529e+001, -1.590e+003, -1.675e+001, 0.000e+000, 0.000e+000],\n", + " [-7.513e+001, -1.377e+003, 0.000e+000, 4.231e-215, -5.695e+001,\n", + " -6.918e+001, -1.314e+003, -1.274e+001, 0.000e+000, 0.000e+000],\n", + " [-8.497e+001, -1.523e+003, 0.000e+000, 2.929e-203, -6.677e+001,\n", + " -7.867e+001, -1.449e+003, -1.543e+001, 0.000e+000, 0.000e+000],\n", + " [-8.505e+001, -1.515e+003, 0.000e+000, 4.348e-106, -6.977e+001,\n", + " -8.070e+001, -1.431e+003, -1.590e+001, 0.000e+000, 0.000e+000],\n", + " [-9.345e+001, -1.688e+003, 0.000e+000, 1.010e-109, -7.592e+001,\n", + " -8.875e+001, -1.597e+003, -1.603e+001, 0.000e+000, 0.000e+000],\n", + " [-7.275e+001, -1.267e+003, 0.000e+000, 2.319e-082, -6.343e+001,\n", + " -7.103e+001, -1.183e+003, -1.422e+001, 0.000e+000, 0.000e+000],\n", + " [-1.071e+002, -1.982e+003, 
0.000e+000, -6.962e-002, -8.320e+001,\n", + " -1.008e+002, -1.874e+003, -1.759e+001, 0.000e+000, 0.000e+000]])\n", + "Db = array([[-48.],\n", + " [-47.],\n", + " [438.],\n", + " [-54.],\n", + " [-43.],\n", + " [-49.],\n", + " [-47.],\n", + " [-52.],\n", + " [-38.],\n", + " [-60.]])\n", + "layer_index = 3, type(layer) = \n", + "Z = array([[ 1.255e+00, 1.439e+00, 1.983e+00, ..., 1.828e+00, 1.922e+00,\n", + " 1.765e+00],\n", + " [ 2.489e+01, 3.212e+01, 3.630e+01, ..., 3.228e+01, 3.263e+01,\n", + " 3.001e+01],\n", + " [-1.015e+02, -1.285e+02, -1.463e+02, ..., -1.311e+02, -1.337e+02,\n", + " -1.213e+02],\n", + " ...,\n", + " [ 1.171e-01, 2.276e-02, 3.176e-01, ..., 3.893e-01, 3.276e-01,\n", + " 4.249e-01],\n", + " [-1.188e+01, -1.565e+01, -1.842e+01, ..., -1.630e+01, -1.610e+01,\n", + " -1.549e+01],\n", + " [-2.346e+01, -3.068e+01, -3.116e+01, ..., -2.684e+01, -2.840e+01,\n", + " -2.414e+01]])\n", + "calculating DZ\n", + "Rectified Linear Unit - deriv_loss\n", + "dZ = array([[ True, True, True, ..., True, True, True],\n", + " [ True, True, True, ..., True, True, True],\n", + " [False, False, False, ..., False, False, False],\n", + " ...,\n", + " [ True, True, True, ..., True, True, True],\n", + " [False, False, False, ..., False, False, False],\n", + " [False, False, False, ..., False, False, False]])\n", + "DZ = array([[-0.509, -0.778, -0.509, ..., -0.253, -0.492, -0.719],\n", + " [19.512, 3.579, 19.512, ..., 5.262, 1.486, 11.096],\n", + " [ 0. , 0. , 0. , ..., 0. , -0. , 0. ],\n", + " ...,\n", + " [ 0.474, -0.129, 0.474, ..., 0.733, 0.12 , 0.38 ],\n", + " [ 0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [ 0. , 0. , 0. , ..., 0. , 0. , 0. ]])\n", + "layer_index = 4, type(layer) = \n", + "A = array([[1.38 , 2.197, 2.494, ..., 1.795, 2.045, 1.795],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [1.427, 1.575, 1.479, ..., 1.59 , 1.591, 1.138],\n", + " ...,\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. 
],\n", + " [4.879, 5.873, 5.865, ..., 5.753, 5.827, 4.625],\n", + " [1.945, 1.994, 2.559, ..., 2.092, 2.926, 2.013]])\n", + "DA = array([[ 73.377, 14.616, 73.377, ..., 21.115, 6.444, 41.105],\n", + " [ -5.26 , -0.786, -5.26 , ..., -1.453, -0.31 , -2.823],\n", + " [ -7.064, -0.989, -7.064, ..., -2.028, -0.284, -3.964],\n", + " ...,\n", + " [ 16.047, 3.238, 16.047, ..., 4.146, 1.268, 9.268],\n", + " [ 47.553, 9.488, 47.553, ..., 14.144, 4.283, 26.383],\n", + " [-12.264, -2.484, -12.264, ..., -3.103, -1.132, -7.089]])\n", + "DW = array([[-3.822e+02, 0.000e+00, -2.639e+02, 0.000e+00, 0.000e+00,\n", + " 0.000e+00, -1.509e+03, 0.000e+00, -1.019e+03, -4.371e+02],\n", + " [ 6.056e+03, 0.000e+00, 4.210e+03, 0.000e+00, 0.000e+00,\n", + " 0.000e+00, 2.418e+04, 0.000e+00, 1.622e+04, 7.019e+03],\n", + " [ 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,\n", + " 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00],\n", + " [ 1.305e-01, 0.000e+00, 7.926e-02, 0.000e+00, 0.000e+00,\n", + " 0.000e+00, 7.373e-01, 0.000e+00, 3.423e-01, 1.625e-01],\n", + " [-2.693e+02, 0.000e+00, -1.863e+02, 0.000e+00, 0.000e+00,\n", + " 0.000e+00, -1.080e+03, 0.000e+00, -7.207e+02, -3.143e+02],\n", + " [-2.655e+02, 0.000e+00, -1.836e+02, 0.000e+00, 0.000e+00,\n", + " 0.000e+00, -1.045e+03, 0.000e+00, -7.074e+02, -3.023e+02],\n", + " [ 5.604e+02, 0.000e+00, 3.870e+02, 0.000e+00, 0.000e+00,\n", + " 0.000e+00, 2.233e+03, 0.000e+00, 1.494e+03, 6.456e+02],\n", + " [ 2.980e+02, 0.000e+00, 2.109e+02, 0.000e+00, 0.000e+00,\n", + " 0.000e+00, 1.219e+03, 0.000e+00, 8.109e+02, 3.530e+02],\n", + " [ 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,\n", + " 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00],\n", + " [ 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,\n", + " 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00]])\n", + "Db = array([[-1.878e+02],\n", + " [ 2.993e+03],\n", + " [ 0.000e+00],\n", + " [ 6.307e-02],\n", + " [-1.338e+02],\n", + " [-1.302e+02],\n", + " [ 2.742e+02],\n", + " [ 1.484e+02],\n", + " [ 0.000e+00],\n", + " [ 0.000e+00]])\n", + "layer_index = 5, type(layer) = \n", + "Z = array([[ 1.38 , 2.197, 2.494, ..., 1.795, 2.045, 1.795],\n", + " [ -0.662, -0.761, -1.04 , ..., -1.058, -0.995, -0.729],\n", + " [ 1.427, 1.575, 1.479, ..., 1.59 , 1.591, 1.138],\n", + " ...,\n", + " [-10.121, -13.166, -15.021, ..., -13.302, -14.044, -12.007],\n", + " [ 4.879, 5.873, 5.865, ..., 5.753, 5.827, 4.625],\n", + " [ 1.945, 1.994, 2.559, ..., 2.092, 2.926, 2.013]])\n", + "calculating DZ\n", + "Rectified Linear Unit - deriv_loss\n", + "dZ = array([[ True, True, True, ..., True, True, True],\n", + " [False, False, False, ..., False, False, False],\n", + " [ True, True, True, ..., True, True, True],\n", + " ...,\n", + " [False, False, False, ..., False, False, False],\n", + " [ True, True, True, ..., True, True, True],\n", + " [ True, True, True, ..., True, True, True]])\n", + "DZ = array([[ 73.377, 14.616, 73.377, ..., 21.115, 6.444, 41.105],\n", + " [ -0. , -0. , -0. , ..., -0. , -0. , -0. ],\n", + " [ -7.064, -0.989, -7.064, ..., -2.028, -0.284, -3.964],\n", + " ...,\n", + " [ 0. , 0. , 0. , ..., 0. , 0. , 0. 
],\n", + " [ 47.553, 9.488, 47.553, ..., 14.144, 4.283, 26.383],\n", + " [-12.264, -2.484, -12.264, ..., -3.103, -1.132, -7.089]])\n", + "layer_index = 6, type(layer) = \n", + "A = array([[0.51 , 0.547, 0.484, ..., 0.961, 0.219, 0.642],\n", + " [0.426, 0.609, 0.388, ..., 0.356, 0.738, 0.189],\n", + " [0.078, 0.052, 0.799, ..., 0.351, 0.931, 0.367],\n", + " [0.01 , 0.289, 0.596, ..., 0.079, 0.232, 0.788],\n", + " [0.379, 0.876, 0.756, ..., 0.657, 0.705, 0.009]])\n", + "DA = array([[146.393, 26.174, 146.393, ..., 41.25 , 10.847, 82.374],\n", + " [ 64.888, 11.888, 64.888, ..., 18.878, 5.054, 36.2 ],\n", + " [100.012, 17.55 , 100.012, ..., 28.431, 7.133, 56.144],\n", + " [169.216, 31.358, 169.216, ..., 47.913, 13.229, 95.14 ],\n", + " [174.569, 34.21 , 174.569, ..., 50.61 , 15.084, 97.554]])\n", + "DW = array([[5762.006, 5828.048, 5800.11 , 5934.795, 5401.454],\n", + " [ 0. , 0. , 0. , 0. , 0. ],\n", + " [-532.94 , -536.532, -536.606, -544.788, -498.021],\n", + " [ 0. , 0. , 0. , 0. , 0. ],\n", + " [ 0. , 0. , 0. , 0. , 0. ],\n", + " [ 0. , 0. , 0. , 0. , 0. ],\n", + " [2187.566, 2208.906, 2201.59 , 2246.633, 2045.877],\n", + " [ 0. , 0. , 0. , 0. , 0. ],\n", + " [3762.651, 3806.036, 3790.114, 3874.75 , 3528.146],\n", + " [-937.336, -949.687, -940.989, -966.983, -875.802]])\n", + "Db = array([[11545.856],\n", + " [ 0. ],\n", + " [-1064.969],\n", + " [ 0. ],\n", + " [ 0. ],\n", + " [ 0. ],\n", + " [ 4379.879],\n", + " [ 0. ],\n", + " [ 7537.423],\n", + " [-1880.977]])\n", + "14.183924172843321\n", + "[[ 0. 0. 0. ... 0. 0. 0. ]\n", + " [ 0. 0. 0. ... 0. 0. 0. ]\n", + " [1803.079 2306.146 2662.136 ... 2328.363 2556.222 2139.582]\n", + " ...\n", + " [ 0. 0. 0. ... 0. 0. 0. ]\n", + " [ 0. 0. 0. ... 0. 0. 0. ]\n", + " [3180.995 4069.157 4695.306 ... 4104.123 4507.743 3778.07 ]]\n", + "[[1.867e+06 2.388e+06 2.755e+06 ... 2.409e+06 2.646e+06 2.217e+06]\n", + " [0.000e+00 0.000e+00 0.000e+00 ... 0.000e+00 0.000e+00 0.000e+00]\n", + " [4.689e+02 6.028e+02 6.972e+02 ... 6.081e+02 6.689e+02 5.588e+02]\n", + " ...\n", + " [0.000e+00 0.000e+00 0.000e+00 ... 0.000e+00 0.000e+00 0.000e+00]\n", + " [2.360e+03 3.019e+03 3.485e+03 ... 3.046e+03 3.345e+03 2.803e+03]\n", + " [8.346e+02 1.069e+03 1.235e+03 ... 1.078e+03 1.185e+03 9.932e+02]]\n", + "[[0. 0. 0. ... 0. 0. 0.]\n", + " [0. 0. 0. ... 0. 0. 0.]\n", + " [0. 0. 0. ... 0. 0. 0.]\n", + " ...\n", + " [0. 0. 0. ... 0. 0. 0.]\n", + " [0. 0. 0. ... 0. 0. 0.]\n", + " [1. 1. 1. ... 1. 1. 1.]]\n", + "[[0. 1. 0. ... 0. 0. 0.]\n", + " [1. 0. 1. ... 0. 0. 0.]\n", + " [0. 0. 0. ... 0. 0. 0.]\n", + " ...\n", + " [0. 0. 0. ... 0. 0. 0.]\n", + " [0. 0. 0. ... 0. 0. 0.]\n", + " [0. 0. 0. ... 0. 0. 
0.]]\n", + "Softmax - deriv_loss\n", + "Y_hat = array([[0., 0., 0., ..., 0., 0., 0.],\n", + " [0., 0., 0., ..., 0., 0., 0.],\n", + " [0., 0., 0., ..., 0., 0., 0.],\n", + " ...,\n", + " [0., 0., 0., ..., 0., 0., 0.],\n", + " [0., 0., 0., ..., 0., 0., 0.],\n", + " [1., 1., 1., ..., 1., 1., 1.]])\n", + "DZ = array([[ 0., -1., 0., ..., 0., 0., 0.],\n", + " [-1., 0., -1., ..., 0., 0., 0.],\n", + " [ 0., 0., 0., ..., 0., 0., 0.],\n", + " ...,\n", + " [ 0., 0., 0., ..., 0., 0., 0.],\n", + " [ 0., 0., 0., ..., 0., 0., 0.],\n", + " [ 1., 1., 1., ..., 1., 1., 1.]])\n", + "layer_index = 2, type(layer) = \n", + "A = array([[1.867e+06, 2.388e+06, 2.755e+06, ..., 2.409e+06, 2.646e+06,\n", + " 2.217e+06],\n", + " [0.000e+00, 0.000e+00, 0.000e+00, ..., 0.000e+00, 0.000e+00,\n", + " 0.000e+00],\n", + " [4.689e+02, 6.028e+02, 6.972e+02, ..., 6.081e+02, 6.689e+02,\n", + " 5.588e+02],\n", + " ...,\n", + " [0.000e+00, 0.000e+00, 0.000e+00, ..., 0.000e+00, 0.000e+00,\n", + " 0.000e+00],\n", + " [2.360e+03, 3.019e+03, 3.485e+03, ..., 3.046e+03, 3.345e+03,\n", + " 2.803e+03],\n", + " [8.346e+02, 1.069e+03, 1.235e+03, ..., 1.078e+03, 1.185e+03,\n", + " 9.932e+02]])\n", + "DA = array([[ 2.013e+01, 1.932e+01, 2.013e+01, ..., 2.260e+01, 1.441e+01,\n", + " 2.220e+01],\n", + " [ 4.715e+02, 4.086e+02, 4.715e+02, ..., 4.704e+02, 3.201e+02,\n", + " 4.686e+02],\n", + " [ 3.002e+01, 2.812e+00, 3.002e+01, ..., 6.208e+00, -4.359e+00,\n", + " 1.430e+01],\n", + " ...,\n", + " [ 4.262e-01, 1.079e+00, 4.262e-01, ..., 2.113e+00, 6.493e-01,\n", + " 2.223e+00],\n", + " [ 5.198e+00, -2.293e-01, 5.198e+00, ..., 1.764e+00, 5.746e-01,\n", + " 3.290e+00],\n", + " [ 2.800e+01, 3.980e+00, 2.800e+01, ..., 8.071e+00, 1.570e+00,\n", + " 1.682e+01]])\n", + "DW = array([[-1.205e+08, 0.000e+00, -3.045e+04, -3.673e+04, -8.629e+07,\n", + " -8.350e+07, 0.000e+00, 0.000e+00, -1.524e+05, -5.401e+04],\n", + " [-1.173e+08, 0.000e+00, -2.964e+04, -3.575e+04, -8.400e+07,\n", + " -8.128e+07, 0.000e+00, 0.000e+00, -1.484e+05, -5.257e+04],\n", + " [-1.532e+08, 0.000e+00, -3.870e+04, -4.668e+04, -1.097e+08,\n", + " -1.061e+08, 0.000e+00, 0.000e+00, -1.937e+05, -6.864e+04],\n", + " [-1.288e+08, 0.000e+00, -3.252e+04, -3.926e+04, -9.223e+07,\n", + " -8.925e+07, 0.000e+00, 0.000e+00, -1.629e+05, -5.772e+04],\n", + " [-1.051e+08, 0.000e+00, -2.653e+04, -3.201e+04, -7.522e+07,\n", + " -7.278e+07, 0.000e+00, 0.000e+00, -1.329e+05, -4.705e+04],\n", + " [-1.170e+08, 0.000e+00, -2.955e+04, -3.567e+04, -8.380e+07,\n", + " -8.109e+07, 0.000e+00, 0.000e+00, -1.480e+05, -5.244e+04],\n", + " [-1.167e+08, 0.000e+00, -2.947e+04, -3.555e+04, -8.352e+07,\n", + " -8.082e+07, 0.000e+00, 0.000e+00, -1.475e+05, -5.227e+04],\n", + " [-1.298e+08, 0.000e+00, -3.280e+04, -3.956e+04, -9.294e+07,\n", + " -8.993e+07, 0.000e+00, 0.000e+00, -1.642e+05, -5.817e+04],\n", + " [-9.793e+07, 0.000e+00, -2.475e+04, -2.984e+04, -7.011e+07,\n", + " -6.784e+07, 0.000e+00, 0.000e+00, -1.238e+05, -4.388e+04],\n", + " [ 1.086e+09, 0.000e+00, 2.744e+05, 3.310e+05, 7.778e+08,\n", + " 7.526e+08, 0.000e+00, 0.000e+00, 1.374e+06, 4.867e+05]])\n", + "Db = array([[-48.],\n", + " [-47.],\n", + " [-62.],\n", + " [-54.],\n", + " [-43.],\n", + " [-49.],\n", + " [-47.],\n", + " [-52.],\n", + " [-38.],\n", + " [440.]])\n", + "layer_index = 3, type(layer) = \n", + "Z = array([[ 1.867e+06, 2.388e+06, 2.755e+06, ..., 2.409e+06, 2.646e+06,\n", + " 2.217e+06],\n", + " [-2.992e+07, -3.827e+07, -4.417e+07, ..., -3.861e+07, -4.241e+07,\n", + " -3.553e+07],\n", + " [ 4.689e+02, 6.028e+02, 6.972e+02, ..., 6.081e+02, 
6.689e+02,\n", + " 5.588e+02],\n", + " ...,\n", + " [-1.503e+06, -1.923e+06, -2.219e+06, ..., -1.940e+06, -2.130e+06,\n", + " -1.785e+06],\n", + " [ 2.360e+03, 3.019e+03, 3.485e+03, ..., 3.046e+03, 3.345e+03,\n", + " 2.803e+03],\n", + " [ 8.346e+02, 1.069e+03, 1.235e+03, ..., 1.078e+03, 1.185e+03,\n", + " 9.932e+02]])\n", + "calculating DZ\n", + "Rectified Linear Unit - deriv_loss\n", + "dZ = array([[ True, True, True, ..., True, True, True],\n", + " [False, False, False, ..., False, False, False],\n", + " [ True, True, True, ..., True, True, True],\n", + " ...,\n", + " [False, False, False, ..., False, False, False],\n", + " [ True, True, True, ..., True, True, True],\n", + " [ True, True, True, ..., True, True, True]])\n", + "DZ = array([[20.128, 19.318, 20.128, ..., 22.595, 14.408, 22.203],\n", + " [ 0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [30.025, 2.812, 30.025, ..., 6.208, -4.359, 14.3 ],\n", + " ...,\n", + " [ 0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [ 5.198, -0.229, 5.198, ..., 1.764, 0.575, 3.29 ],\n", + " [27.996, 3.98 , 27.996, ..., 8.071, 1.57 , 16.825]])\n", + "layer_index = 4, type(layer) = \n", + "A = array([[ 0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [ 0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [1803.079, 2306.146, 2662.136, ..., 2328.363, 2556.222, 2139.582],\n", + " ...,\n", + " [ 0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [ 0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [3180.995, 4069.157, 4695.306, ..., 4104.123, 4507.743, 3778.07 ]])\n", + "DA = array([[ 1.321e+04, 1.378e+04, 1.321e+04, ..., 1.748e+04, 1.239e+04,\n", + " 1.841e+04],\n", + " [ 4.206e-01, -1.005e+01, 4.206e-01, ..., -1.210e+01, -1.222e+01,\n", + " -1.033e+01],\n", + " [ 9.486e+03, 9.561e+03, 9.486e+03, ..., 1.216e+04, 8.533e+03,\n", + " 1.291e+04],\n", + " ...,\n", + " [-1.134e+02, -1.990e+01, -1.134e+02, ..., -3.487e+01, 4.643e+00,\n", + " -6.338e+01],\n", + " [ 3.634e+04, 3.690e+04, 3.634e+04, ..., 4.690e+04, 3.299e+04,\n", + " 4.972e+04],\n", + " [ 1.578e+04, 1.589e+04, 1.578e+04, ..., 2.022e+04, 1.418e+04,\n", + " 2.149e+04]])\n", + "DW = array([[0.000e+00, 0.000e+00, 1.522e+08, 0.000e+00, 0.000e+00, 0.000e+00,\n", + " 0.000e+00, 0.000e+00, 0.000e+00, 2.685e+08],\n", + " [0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,\n", + " 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00],\n", + " [0.000e+00, 0.000e+00, 7.770e+06, 0.000e+00, 0.000e+00, 0.000e+00,\n", + " 0.000e+00, 0.000e+00, 0.000e+00, 1.371e+07],\n", + " [0.000e+00, 0.000e+00, 4.187e+05, 0.000e+00, 0.000e+00, 0.000e+00,\n", + " 0.000e+00, 0.000e+00, 0.000e+00, 7.390e+05],\n", + " [0.000e+00, 0.000e+00, 1.186e+08, 0.000e+00, 0.000e+00, 0.000e+00,\n", + " 0.000e+00, 0.000e+00, 0.000e+00, 2.092e+08],\n", + " [0.000e+00, 0.000e+00, 1.431e+08, 0.000e+00, 0.000e+00, 0.000e+00,\n", + " 0.000e+00, 0.000e+00, 0.000e+00, 2.525e+08],\n", + " [0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,\n", + " 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00],\n", + " [0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00,\n", + " 0.000e+00, 0.000e+00, 0.000e+00, 0.000e+00],\n", + " [0.000e+00, 0.000e+00, 1.722e+06, 0.000e+00, 0.000e+00, 0.000e+00,\n", + " 0.000e+00, 0.000e+00, 0.000e+00, 3.039e+06],\n", + " [0.000e+00, 0.000e+00, 9.531e+06, 0.000e+00, 0.000e+00, 0.000e+00,\n", + " 0.000e+00, 0.000e+00, 0.000e+00, 1.682e+07]])\n", + "Db = array([[63743.607],\n", + " [ 0. ],\n", + " [ 3252.695],\n", + " [ 175.413],\n", + " [49691.379],\n", + " [59962.643],\n", + " [ 0. ],\n", + " [ 0. 
],\n", + " [ 724.818],\n", + " [ 4004.355]])\n", + "layer_index = 5, type(layer) = \n", + "Z = array([[-1.953e+04, -2.499e+04, -2.884e+04, ..., -2.521e+04, -2.769e+04,\n", + " -2.319e+04],\n", + " [-6.625e-01, -7.611e-01, -1.040e+00, ..., -1.058e+00, -9.953e-01,\n", + " -7.290e-01],\n", + " [ 1.803e+03, 2.306e+03, 2.662e+03, ..., 2.328e+03, 2.556e+03,\n", + " 2.140e+03],\n", + " ...,\n", + " [-1.012e+01, -1.317e+01, -1.502e+01, ..., -1.330e+01, -1.404e+01,\n", + " -1.201e+01],\n", + " [-1.275e+04, -1.631e+04, -1.883e+04, ..., -1.646e+04, -1.808e+04,\n", + " -1.514e+04],\n", + " [ 3.181e+03, 4.069e+03, 4.695e+03, ..., 4.104e+03, 4.508e+03,\n", + " 3.778e+03]])\n", + "calculating DZ\n", + "Rectified Linear Unit - deriv_loss\n", + "dZ = array([[False, False, False, ..., False, False, False],\n", + " [False, False, False, ..., False, False, False],\n", + " [ True, True, True, ..., True, True, True],\n", + " ...,\n", + " [False, False, False, ..., False, False, False],\n", + " [False, False, False, ..., False, False, False],\n", + " [ True, True, True, ..., True, True, True]])\n", + "DZ = array([[ 0. , 0. , 0. , ..., 0. , 0. ,\n", + " 0. ],\n", + " [ 0. , -0. , 0. , ..., -0. , -0. ,\n", + " -0. ],\n", + " [ 9485.733, 9561.338, 9485.733, ..., 12159.372, 8532.786,\n", + " 12910.332],\n", + " ...,\n", + " [ -0. , -0. , -0. , ..., -0. , 0. ,\n", + " -0. ],\n", + " [ 0. , 0. , 0. , ..., 0. , 0. ,\n", + " 0. ],\n", + " [15783.008, 15886.483, 15783.008, ..., 20219.977, 14182.129,\n", + " 21490.405]])\n", + "layer_index = 6, type(layer) = \n", + "A = array([[0.51 , 0.547, 0.484, ..., 0.961, 0.219, 0.642],\n", + " [0.426, 0.609, 0.388, ..., 0.356, 0.738, 0.189],\n", + " [0.078, 0.052, 0.799, ..., 0.351, 0.931, 0.367],\n", + " [0.01 , 0.289, 0.596, ..., 0.079, 0.232, 0.788],\n", + " [0.379, 0.876, 0.756, ..., 0.657, 0.705, 0.009]])\n", + "DA = array([[19848596.008, 19985877.716, 19848596.008, ..., 25432217.372,\n", + " 17840245.301, 27023192.393],\n", + " [20088691.146, 20227596.001, 20088691.146, ..., 25739834.339,\n", + " 18056021.093, 27350089.44 ],\n", + " [19957263.245, 20095305.511, 19957263.245, ..., 25571458.524,\n", + " 17937923.352, 27171135.418],\n", + " [20426505.393, 20567729.688, 20426505.393, ..., 26172670.469,\n", + " 18359642.432, 27810019.228],\n", + " [18550075.335, 18678377.266, 18550075.335, ..., 23768409.721,\n", + " 16673114.452, 25255299.827]])\n", + "DW = array([[ 0. , 0. , 0. , 0. ,\n", + " 0. ],\n", + " [ 0. , 0. , 0. , 0. ,\n", + " 0. ],\n", + " [18071476.294, 18568754.541, 18442309.057, 18690923.299,\n", + " 18577123.322],\n", + " [ 0. , 0. , 0. , 0. ,\n", + " 0. ],\n", + " [ 0. , 0. , 0. , 0. ,\n", + " 0. ],\n", + " [ 0. , 0. , 0. , 0. ,\n", + " 0. ],\n", + " [ 0. , 0. , 0. , 0. ,\n", + " 0. ],\n", + " [ 0. , 0. , 0. , 0. ,\n", + " 0. ],\n", + " [ 0. , 0. , 0. , 0. ,\n", + " 0. ],\n", + " [30065950.362, 30893119.393, 30682706.754, 31096260.652,\n", + " 30907295.574]])\n", + "Db = array([[ 0. ],\n", + " [ 0. ],\n", + " [37056138.956],\n", + " [ 0. ],\n", + " [ 0. ],\n", + " [ 0. ],\n", + " [ 0. ],\n", + " [ 0. ],\n", + " [ 0. ],\n", + " [61651090.299]])\n", + "14.377341321203978\n", + "[[0. 0. 0. ... 0. 0. 0.]\n", + " [0. 0. 0. ... 0. 0. 0.]\n", + " [0. 0. 0. ... 0. 0. 0.]\n", + " ...\n", + " [0. 0. 0. ... 0. 0. 0.]\n", + " [0. 0. 0. ... 0. 0. 0.]\n", + " [0. 0. 0. ... 0. 0. 0.]]\n", + "[[0. 0. 0. ... 0. 0. 0.]\n", + " [0. 0. 0. ... 0. 0. 0.]\n", + " [0. 0. 0. ... 0. 0. 0.]\n", + " ...\n", + " [0. 0. 0. ... 0. 0. 0.]\n", + " [0. 0. 0. ... 0. 0. 0.]\n", + " [0. 0. 0. ... 0. 0. 
0.]]\n", + "[[4.533e-009 4.533e-009 4.533e-009 ... 4.533e-009 4.533e-009 4.533e-009]\n", + " [1.040e-030 1.040e-030 1.040e-030 ... 1.040e-030 1.040e-030 1.040e-030]\n", + " [3.267e-209 3.267e-209 3.267e-209 ... 3.267e-209 3.267e-209 3.267e-209]\n", + " ...\n", + " [5.519e-010 5.519e-010 5.519e-010 ... 5.519e-010 5.519e-010 5.519e-010]\n", + " [9.329e-018 9.329e-018 9.329e-018 ... 9.329e-018 9.329e-018 9.329e-018]\n", + " [9.308e-213 9.308e-213 9.308e-213 ... 9.308e-213 9.308e-213 9.308e-213]]\n", + "[[0. 1. 0. ... 0. 0. 0.]\n", + " [1. 0. 1. ... 0. 0. 0.]\n", + " [0. 0. 0. ... 0. 0. 0.]\n", + " ...\n", + " [0. 0. 0. ... 0. 0. 0.]\n", + " [0. 0. 0. ... 0. 0. 0.]\n", + " [0. 0. 0. ... 0. 0. 0.]]\n", + "Softmax - deriv_loss\n", + "Y_hat = array([[4.533e-009, 4.533e-009, 4.533e-009, ..., 4.533e-009, 4.533e-009,\n", + " 4.533e-009],\n", + " [1.040e-030, 1.040e-030, 1.040e-030, ..., 1.040e-030, 1.040e-030,\n", + " 1.040e-030],\n", + " [3.267e-209, 3.267e-209, 3.267e-209, ..., 3.267e-209, 3.267e-209,\n", + " 3.267e-209],\n", + " ...,\n", + " [5.519e-010, 5.519e-010, 5.519e-010, ..., 5.519e-010, 5.519e-010,\n", + " 5.519e-010],\n", + " [9.329e-018, 9.329e-018, 9.329e-018, ..., 9.329e-018, 9.329e-018,\n", + " 9.329e-018],\n", + " [9.308e-213, 9.308e-213, 9.308e-213, ..., 9.308e-213, 9.308e-213,\n", + " 9.308e-213]])\n", + "DZ = array([[ 4.533e-009, -1.000e+000, 4.533e-009, ..., 4.533e-009,\n", + " 4.533e-009, 4.533e-009],\n", + " [-1.000e+000, 1.040e-030, -1.000e+000, ..., 1.040e-030,\n", + " 1.040e-030, 1.040e-030],\n", + " [ 3.267e-209, 3.267e-209, 3.267e-209, ..., 3.267e-209,\n", + " 3.267e-209, 3.267e-209],\n", + " ...,\n", + " [ 5.519e-010, 5.519e-010, 5.519e-010, ..., 5.519e-010,\n", + " 5.519e-010, 5.519e-010],\n", + " [ 9.329e-018, 9.329e-018, 9.329e-018, ..., 9.329e-018,\n", + " 9.329e-018, 9.329e-018],\n", + " [ 9.308e-213, 9.308e-213, 9.308e-213, ..., 9.308e-213,\n", + " 9.308e-213, 9.308e-213]])\n", + "layer_index = 2, type(layer) = \n", + "A = array([[0., 0., 0., ..., 0., 0., 0.],\n", + " [0., 0., 0., ..., 0., 0., 0.],\n", + " [0., 0., 0., ..., 0., 0., 0.],\n", + " ...,\n", + " [0., 0., 0., ..., 0., 0., 0.],\n", + " [0., 0., 0., ..., 0., 0., 0.],\n", + " [0., 0., 0., ..., 0., 0., 0.]])\n", + "DA = array([[ 1.150e+07, 8.294e+06, 1.150e+07, ..., 1.216e+07, -3.705e-02,\n", + " 1.178e+07],\n", + " [ 1.514e+02, 8.843e+01, 1.514e+02, ..., 1.502e+02, -3.889e-07,\n", + " 1.484e+02],\n", + " [ 2.915e+03, 2.074e+03, 2.915e+03, ..., 3.062e+03, -9.257e-06,\n", + " 2.992e+03],\n", + " ...,\n", + " [-2.230e-01, 4.293e-01, -2.230e-01, ..., 1.463e+00, -2.598e-09,\n", + " 1.573e+00],\n", + " [ 1.455e+04, 1.049e+04, 1.455e+04, ..., 1.539e+04, -4.684e-05,\n", + " 1.490e+04],\n", + " [ 5.174e+03, 3.709e+03, 5.174e+03, ..., 5.457e+03, -1.657e-05,\n", + " 5.298e+03]])\n", + "DW = array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],\n", + " [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],\n", + " [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],\n", + " [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],\n", + " [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],\n", + " [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],\n", + " [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],\n", + " [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],\n", + " [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],\n", + " [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])\n", + "Db = array([[-48.],\n", + " [-47.],\n", + " [-62.],\n", + " [446.],\n", + " [-43.],\n", + " [-49.],\n", + " [-47.],\n", + " [-52.],\n", + " [-38.],\n", + " [-60.]])\n", + "layer_index = 3, type(layer) = \n", + "Z = array([[-63556.025, 
-63556.025, -63556.025, ..., -63556.025, -63556.025,\n", + " -63556.025],\n", + " [ -2990.354, -2990.354, -2990.354, ..., -2990.354, -2990.354,\n", + " -2990.354],\n", + " [ -3263.41 , -3263.41 , -3263.41 , ..., -3263.41 , -3263.41 ,\n", + " -3263.41 ],\n", + " ...,\n", + " [ -148.587, -148.587, -148.587, ..., -148.587, -148.587,\n", + " -148.587],\n", + " [ -725.835, -725.835, -725.835, ..., -725.835, -725.835,\n", + " -725.835],\n", + " [ -4010.389, -4010.389, -4010.389, ..., -4010.389, -4010.389,\n", + " -4010.389]])\n", + "calculating DZ\n", + "Rectified Linear Unit - deriv_loss\n", + "dZ = array([[False, False, False, ..., False, False, False],\n", + " [False, False, False, ..., False, False, False],\n", + " [False, False, False, ..., False, False, False],\n", + " ...,\n", + " [False, False, False, ..., False, False, False],\n", + " [False, False, False, ..., False, False, False],\n", + " [False, False, False, ..., False, False, False]])\n", + "DZ = array([[ 0., 0., 0., ..., 0., -0., 0.],\n", + " [ 0., 0., 0., ..., 0., -0., 0.],\n", + " [ 0., 0., 0., ..., 0., -0., 0.],\n", + " ...,\n", + " [-0., 0., -0., ..., 0., -0., 0.],\n", + " [ 0., 0., 0., ..., 0., -0., 0.],\n", + " [ 0., 0., 0., ..., 0., -0., 0.]])\n", + "layer_index = 4, type(layer) = \n", + "A = array([[0., 0., 0., ..., 0., 0., 0.],\n", + " [0., 0., 0., ..., 0., 0., 0.],\n", + " [0., 0., 0., ..., 0., 0., 0.],\n", + " ...,\n", + " [0., 0., 0., ..., 0., 0., 0.],\n", + " [0., 0., 0., ..., 0., 0., 0.],\n", + " [0., 0., 0., ..., 0., 0., 0.]])\n", + "DA = array([[0., 0., 0., ..., 0., 0., 0.],\n", + " [0., 0., 0., ..., 0., 0., 0.],\n", + " [0., 0., 0., ..., 0., 0., 0.],\n", + " ...,\n", + " [0., 0., 0., ..., 0., 0., 0.],\n", + " [0., 0., 0., ..., 0., 0., 0.],\n", + " [0., 0., 0., ..., 0., 0., 0.]])\n", + "DW = array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],\n", + " [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],\n", + " [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],\n", + " [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],\n", + " [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],\n", + " [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],\n", + " [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],\n", + " [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],\n", + " [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],\n", + " [0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])\n", + "Db = array([[0.],\n", + " [0.],\n", + " [0.],\n", + " [0.],\n", + " [0.],\n", + " [0.],\n", + " [0.],\n", + " [0.],\n", + " [0.],\n", + " [0.]])\n", + "layer_index = 5, type(layer) = \n", + "Z = array([[-1.953e+04, -2.499e+04, -2.884e+04, ..., -2.521e+04, -2.769e+04,\n", + " -2.319e+04],\n", + " [-6.625e-01, -7.611e-01, -1.040e+00, ..., -1.058e+00, -9.953e-01,\n", + " -7.290e-01],\n", + " [-6.286e+07, -8.087e+07, -9.291e+07, ..., -8.119e+07, -8.931e+07,\n", + " -7.380e+07],\n", + " ...,\n", + " [-1.012e+01, -1.317e+01, -1.502e+01, ..., -1.330e+01, -1.404e+01,\n", + " -1.201e+01],\n", + " [-1.275e+04, -1.631e+04, -1.883e+04, ..., -1.646e+04, -1.808e+04,\n", + " -1.514e+04],\n", + " [-1.046e+08, -1.345e+08, -1.546e+08, ..., -1.351e+08, -1.486e+08,\n", + " -1.228e+08]])\n", + "calculating DZ\n", + "Rectified Linear Unit - deriv_loss\n", + "dZ = array([[False, False, False, ..., False, False, False],\n", + " [False, False, False, ..., False, False, False],\n", + " [False, False, False, ..., False, False, False],\n", + " ...,\n", + " [False, False, False, ..., False, False, False],\n", + " [False, False, False, ..., False, False, False],\n", + " [False, False, False, ..., False, False, False]])\n", + "DZ = array([[0., 0., 0., ..., 0., 
0., 0.],\n", + " [0., 0., 0., ..., 0., 0., 0.],\n", + " [0., 0., 0., ..., 0., 0., 0.],\n", + " ...,\n", + " [0., 0., 0., ..., 0., 0., 0.],\n", + " [0., 0., 0., ..., 0., 0., 0.],\n", + " [0., 0., 0., ..., 0., 0., 0.]])\n", + "layer_index = 6, type(layer) = \n", + "A = array([[0.51 , 0.547, 0.484, ..., 0.961, 0.219, 0.642],\n", + " [0.426, 0.609, 0.388, ..., 0.356, 0.738, 0.189],\n", + " [0.078, 0.052, 0.799, ..., 0.351, 0.931, 0.367],\n", + " [0.01 , 0.289, 0.596, ..., 0.079, 0.232, 0.788],\n", + " [0.379, 0.876, 0.756, ..., 0.657, 0.705, 0.009]])\n", + "DA = array([[0., 0., 0., ..., 0., 0., 0.],\n", + " [0., 0., 0., ..., 0., 0., 0.],\n", + " [0., 0., 0., ..., 0., 0., 0.],\n", + " [0., 0., 0., ..., 0., 0., 0.],\n", + " [0., 0., 0., ..., 0., 0., 0.]])\n", + "DW = array([[0., 0., 0., 0., 0.],\n", + " [0., 0., 0., 0., 0.],\n", + " [0., 0., 0., 0., 0.],\n", + " [0., 0., 0., 0., 0.],\n", + " [0., 0., 0., 0., 0.],\n", + " [0., 0., 0., 0., 0.],\n", + " [0., 0., 0., 0., 0.],\n", + " [0., 0., 0., 0., 0.],\n", + " [0., 0., 0., 0., 0.],\n", + " [0., 0., 0., 0., 0.]])\n", + "Db = array([[0.],\n", + " [0.],\n", + " [0.],\n", + " [0.],\n", + " [0.],\n", + " [0.],\n", + " [0.],\n", + " [0.],\n", + " [0.],\n", + " [0.]])\n" + ] + } + ], + "source": [ + "from loss_functions import CrossEntropy\n", + "from activation_functions import ActivationLayer, Softmax, ReLU\n", + "from weights import Transformation\n", + "from network import Network\n", + "\n", + "import numpy as np\n", + "np.set_printoptions(precision=3)\n", + "\n", + "\n", + "m = 500 # number of examples\n", + "n = 10 # number of labels\n", + "x = 5 # datapoints per example\n", + "Y = np.zeros((n,m))\n", + "Y[np.random.randint(low=n,size=m),range(m)] = 1\n", + "X = np.random.rand(x,m)\n", + "\n", + "print(f\"{Y.shape = }\")\n", + "print(f\"{X.shape = }\")\n", + "\n", + "\n", + "\n", + "network = Network(data = X, Y=Y, Loss = CrossEntropy)\n", + "network.add_layer(ReLU, 10)\n", + "network.add_layer(ReLU, 10)\n", + "network.add_layer(Softmax, n)\n", + "network.initialize()\n", + "\n", + "lr = 1\n", + "for _ in range(4):\n", + " vals, loss = network.forward_pass()\n", + " print(loss)\n", + " # for val in vals:\n", + " # print(val)\n", + " # print(vals[-9])\n", + " # print(vals[-7])\n", + " print(vals[-5])\n", + " print(vals[-3])\n", + " print(vals[-1])\n", + " print(Y)\n", + " dWs, dbs = network.backwards_propagation(vals)\n", + " network.update(lr=lr, dWs = dWs, dbs = dbs)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'n1' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32mc:\\Users\\lukes\\OneDrive\\Documents\\coding\\nn_from_scratch\\src\\lukestest.ipynb Cell 4\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[0m \u001b[39mprint\u001b[39m(n1)\n\u001b[0;32m 2\u001b[0m \u001b[39mprint\u001b[39m(n2)\n\u001b[0;32m 4\u001b[0m \u001b[39mdef\u001b[39;00m \u001b[39mmy_einsum\u001b[39m(string_rep, \u001b[39m*\u001b[39moperands):\n", + "\u001b[1;31mNameError\u001b[0m: name 'n1' is not defined" + ] + } + ], + "source": [ + "\n", + "print(n1)\n", + "print(n2)\n", + "\n", + "def my_einsum(string_rep, *operands):\n", + " ops, out = string_rep.split('->')\n", + " ops = ops.split(',')\n", + " operands = list(zip(ops,operands))\n", + " keys = dict()\n", 
+ " for operand in operands:\n", + " assert len(operand[0]) == len(operand[1].shape), f\"operand problem: {operand[0]}, {operand[1]}\"\n", + " shape = operand[1].shape\n", + " for index, char in enumerate(operand[0]):\n", + " if char in keys:\n", + " assert keys[char] == shape[index], f\"charkeys problem: {keys[char]}, {shape[index]}\"\n", + " else:\n", + " keys[char] = shape[index]\n", + "\n", + " \n", + " \n", + "\n", + "\n", + "\n", + "my_einsum(\"ij,jk->jk\",n1,n2)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# print(n1)\n", + "# print()\n", + "# print()\n", + "# print(n2)\n", + "# print()\n", + "# print()\n", + "# print(np.einsum('ik,jk->ji',n1,n2))\n", + "# print()\n", + "# print()\n", + "\n", + "# result = np.zeros((n2.shape[0],n1.shape[0]))\n", + "# for i in range(n1.shape[0]):\n", + "# for k in range(n1.shape[1]):\n", + "# for j in range(n2.shape[0]):\n", + "# result[j,i] += n1[i,k] * n2[j,k]\n", + "# print(result)" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[[0.31815146 0. 0. ]\n", + " [0. 0.35729241 0. ]\n", + " [0. 0. 0.32455613]]\n", + "\n", + " [[0.23422572 0. 0. ]\n", + " [0. 0.27828213 0. ]\n", + " [0. 0. 0.48749215]]]\n", + "[[[0.10122035 0.1136731 0.10325801]\n", + " [0.1136731 0.12765787 0.11596144]\n", + " [0.10325801 0.11596144 0.10533668]]\n", + "\n", + " [[0.05486169 0.06518083 0.1141832 ]\n", + " [0.06518083 0.07744094 0.13566035]\n", + " [0.1141832 0.13566035 0.23764859]]]\n" + ] + }, + { + "data": { + "text/plain": [ + "''" + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "def activate(Z: np.array):\n", + " # collapses 1 dim of array\n", + " max_Z = np.amax(Z, 1).reshape(Z.shape[0],1) # Get the row-wise maximum\n", + " eZ = np.exp(Z - max_Z ) # For stability\n", + " return eZ / eZ.sum(axis=1, keepdims=True) \n", + "\n", + "\n", + "def deriv(Z):\n", + " softmax = activate(Z)\n", + " identity = np.eye(softmax.shape[-1])\n", + " t1 = np.zeros(softmax.shape+ (softmax.shape[-1],),dtype=np.float32)\n", + " t2 = np.zeros(softmax.shape+ (softmax.shape[-1],),dtype=np.float32)\n", + " t1 = np.einsum('ij,jk->ijk',softmax,identity)\n", + " print(t1)\n", + " t2 = np.einsum('ij,ik->ijk',softmax,softmax)\n", + " print(t2)\n", + " return t1-t2\n", + "\n", + "deriv(n1)\n", + "\"\"" + ] + }, { "cell_type": "code", "execution_count": 90, @@ -433,7 +1641,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3.8.10 64-bit", + "display_name": "Python 3.10.4 ('nn_from_scratch-5wfMGXWy')", "language": "python", "name": "python3" }, @@ -447,12 +1655,12 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.10" + "version": "3.10.5" }, "orig_nbformat": 4, "vscode": { "interpreter": { - "hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1" + "hash": "f5740e5a6f631708b57c54ec6c0e0c4902b40036b869c78f2b20fbf3952ee788" } } }, diff --git a/src/network.py b/src/network.py new file mode 100644 index 0000000..bab43ce --- /dev/null +++ b/src/network.py @@ -0,0 +1,138 @@ +""" +classes for a neural network +""" + +import numpy as np +from activation_functions import ActivationLayer + +from loss_functions import Loss +from weights import Transformation + + +class Network: + """ + Neural Network class + requires: + - layer_sizes = an array of ints representing number of neurons 
+      one per layer (can also be built up with add_layer before initialize)
+    - Y = one-hot label matrix of shape (n, m)
+    """
+
+    initialized = False
+
+    def __init__(self, data, Y, Loss, layer_descriptions=None):
+        """
+        Loss = a Loss subclass such as CrossEntropy (instantiated here with Y)
+        layer_descriptions = an iterable of (activation, size) tuples
+        """
+        self.X = data
+        self.m = data.shape[-1]
+        self.Loss = Loss(Y=Y)
+
+        self.layer_descriptions = []
+        if layer_descriptions:
+            self.layer_descriptions = layer_descriptions
+            self.initialize()
+
+    def add_layer(self, layer_type, size):
+        if self.initialized:
+            raise RuntimeError("cannot add layer once initialized")
+        self.layer_descriptions.append((layer_type, size))
+
+    def remove_layer(self, n=-1):
+        if self.initialized:
+            raise RuntimeError("cannot remove layer once initialized")
+        self.layer_descriptions.pop(n)

+    def initialize(self):
+        """
+        builds self.layers back to front:
+        [Loss, activation, Transformation, ..., activation, input Transformation]
+        """
+        if self.initialized:
+            raise RuntimeError("already initialized")
+        if not self.layer_descriptions:
+            raise RuntimeError("cannot initialize without layer descriptions")
+
+        next_step = self.Loss
+        self.layers = [next_step]
+        for layer_type, size in self.layer_descriptions[::-1]:
+            current_layer = layer_type(size=size, next_step=next_step)
+            if isinstance(next_step, Loss):
+                # the final activation feeds the loss directly, with no
+                # Transformation in between
+                self.layers.append(current_layer)
+                next_step = current_layer
+                continue
+            transformation = Transformation(
+                prev_size=current_layer.size, next_step_size=next_step.size
+            )
+            self.layers.append(transformation)
+            self.layers.append(current_layer)
+            next_step = current_layer
+        transformation = Transformation(
+            prev_size=self.X.shape[0], next_step_size=current_layer.size
+        )
+        self.layers.append(transformation)
+
+        self.initialized = True
+
+    def forward_pass(self):
+        current = self.X
+        vals = [current]
+        # self.layers is stored back to front, so walk it in reverse,
+        # stopping before the Loss at index 0
+        for layer in self.layers[:0:-1]:
+            new_current = layer.apply(current)
+            vals.append(new_current)
+            current = new_current
+        loss = self.Loss.loss(current)
+
+        return (vals, loss)
+
+    def backwards_propagation(self, vals):
+        """
+        vals[-1] = Y_hat
+        vals[-2] = Z_final
+        vals[-3] = A_final
+        ...
+        vals[3] = Z1
+        vals[2] = A1
+        vals[1] = Z0
+        vals[0] = data
+        """
+        DLs = []  # kept for debugging; only dWs and dbs are returned
+        dWs = []
+        dbs = []
+        # self.layers[1] is the final activation; its deriv_loss hands back
+        # DZ for the last Transformation directly
+        last_layer = self.layers[1]
+        Y_hat = vals[-1]
+        DZ = last_layer.deriv_loss(Y_hat)
+        DLs.append(DZ)
+        print(f"{Y_hat = }")
+        print(f"{DZ = }")
+
+        for index, layer in enumerate(self.layers[2:]):
+            print(f"layer_index = {index+2}, {type(layer) = }")
+            if isinstance(layer, Transformation):
+                A = vals[-index - 3]
+                DA = layer.deriv_loss(DZ)
+                DW = layer.weights.deriv_loss(m=self.m, A=A, DZ=DZ)
+                Db = layer.biases.deriv_loss(m=self.m, DZ=DZ)
+                print(f"{A = }")
+                print(f"{DA = }")
+                print(f"{DW = }")
+                print(f"{Db = }")
+                DLs.append(DA)
+                dWs.append(DW)
+                dbs.append(Db)
+
+            elif isinstance(layer, ActivationLayer):
+                Z = vals[-index - 3]
+                dZ = layer.deriv(Z)
+                print(f"{Z = }")
+                print("calculating DZ")
+                DZ = layer.deriv_loss(DA=DA, dZ=dZ)
+                print(f"{DZ = }")
+                DLs.append(DZ)
+
+        return (dWs, dbs)
+
+    def update(self, lr, dWs, dbs):
+        # dWs and dbs were collected output side first, which matches the
+        # order in which Transformations appear in self.layers
+        i = 0
+        for layer in self.layers:
+            if isinstance(layer, Transformation):
+                layer.weights.W -= lr * dWs[i]
+                layer.biases.b -= lr * dbs[i]
+                i += 1
diff --git a/src/nn_model.ipynb b/src/nn_model.ipynb
index c09a35d..43c4c66 100644
--- a/src/nn_model.ipynb
+++ b/src/nn_model.ipynb
@@ -373,7 +373,7 @@
 ],
 "metadata": {
  "kernelspec": {
-  "display_name": "Python 3 (ipykernel)",
+  "display_name": "Python 3.10.4 ('nn_from_scratch-5wfMGXWy')",
   "language": "python",
   "name": "python3"
  },
@@ -387,11 +387,11 @@
  "name": "python",
  "nbconvert_exporter": "python",
  "pygments_lexer": "ipython3",
-  "version": "3.8.10"
+  "version": "3.10.5"
 },
 "vscode": {
  "interpreter": {
-   "hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1"
+   "hash": "f5740e5a6f631708b57c54ec6c0e0c4902b40036b869c78f2b20fbf3952ee788"
  }
 }
},
diff --git a/src/weights.py b/src/weights.py
new file mode 100644
index 0000000..efbc0f8
--- /dev/null
+++ b/src/weights.py
@@ -0,0 +1,151 @@
+"""
+Module for creating weights and biases for neural net
+"""
+
+import numpy as np
+
+
+class Weights:
+    """
+    Weights applied between neural layers
+    Notation: input = A, and output = Zw (Z without Biases)
+    """
+
+    name = "Weights"
+    equation = "W dot A"
+
+    def __init__(self, prev_size, next_size):
+        # self.W = np.random.randn(next_size, prev_size)
+        self.W = np.random.rand(next_size, prev_size) - 0.5
+
+    def apply(self, A):
+        """
+        returns Zw from equation
+        Z = Zw + b = W dot A + b
+        """
+        return np.dot(self.W, A)
+
+    def deriv(self, A):
+        """
+        finding dZ2/dW2:
+        from (9) Z2{n,m} = W2{n,n} dot A1{n,m} + b2{n,1}
+        let i,j,k in range(n), dropping m for now
+        Z2[i] = W2[i]{n} dot A1{n} + b2[i] = {sum over j} W2[i][j] * A1[j] + b2[i]
+        dZ2[i]/dW2[j,k]{1} = 0 if i != j, else A1[k]
+        dZ2[i]/dW2[i,k]{1} = A1[k]
+        dZ2/dW2{n} = A1
+        adding m back in: for l in range(m)
+        Z2[i,l]{1} = W2[i] dot A1[l] + b2[i]
+        dZ2[i,l]/dW2[i,k]{1} = A1[k,l]
+        dZ2[l]/dW2{n} = A1[l]{n}
+        dZ2/dW2{n,m} = A1{n,m}
+        This is what you would multiply a delta_W by to get the change in Z2,
+        had you added that delta_W to W2 and recalculated Z2 directly
+        """
+        return A
+
+    def deriv_loss(self, m, A, DZ):
+        """
+        DW2{n,n} = DZ2 * dZ2/dW2 = DZ2{n} dot A1{n}
+        The derivative of the loss with respect to particular values of W2
+        To bring m back in the picture, we have to average over all of the
+        losses accrued during the training run. Namely m training examples:
+        (15) DW2{n,n} = 1/m * DZ2 * dZ2/dW2 = 1/m * DZ2{n,m} dot A1{n,m}.T{m,n}
+        """
+        return np.dot(DZ, A.T) / m  # the 1/m from (15), averaging over examples
+
+
+class Biases:
+    """
+    Biases applied between neural layers
+    Notation: input = Zw, and output = Z
+    """
+
+    name = "Biases"
+    equation = "Zw + b"
+
+    def __init__(self, next_size):
+        # self.b = np.random.randn(next_size, 1)
+        self.b = np.random.rand(next_size, 1) - 0.5
+
+    def apply(self, Zw):
+        """
+        returns Z from equation
+        Z = Zw + b = W dot A + b
+        """
+        return Zw + self.b
+
+    def deriv(self):
+        """
+        finding dZ2/db2:
+        from (9) Z2{n,m} = W2{n,n} dot A1{n,m} + b2{n,1}
+        let i,j in range(n), dropping m for now
+        Z2[i] = W2[i]{n} dot A1{n} + b2[i] = {sum over j} W2[i][j] * A1[j] + b2[i]
+        dZ2[i]/db2[j] = 0 if i != j, else 1
+        dZ2/db2 = 1 (element-wise; the identity in matrix form)
+        adding m back in: b2 is broadcast across all m columns of Z2, so
+        dZ2[i,l]/db2[i] = 1 for every l in range(m)
+        """
+        return 1
+
+    def deriv_loss(self, m, DZ):
+        """
+        Db2{n} = DZ2 * dZ2/db2 = DZ2{n} * 1{n} = DZ2{n}
+        The derivative of the loss with respect to particular values of b2
+        To bring m back in the picture, we have to average over all of the
+        losses accrued during the training run. Namely m training examples:
+        (16) Db2{n,1} = 1/m * DZ2 * dZ2/db2 = 1/m * np.sum(DZ2{n,m}, axis=1)
+        """
+        return np.sum(DZ, axis=1, keepdims=True) / m  # the 1/m from (16)
+
+
+class Transformation:
+    """
+    Transformation from the output of the previous layer (A) to the input
+    of the next layer (Z)
+    """
+
+    def __init__(self, prev_size, next_step_size):
+        self.weights = Weights(prev_size=prev_size, next_size=next_step_size)
+        self.biases = Biases(next_size=next_step_size)
+
+    def apply(self, A):
+        """
+        applies weights and biases to A to return Z
+        Z = W dot A + b
+        """
+        return np.dot(self.weights.W, A) + self.biases.b
+
+    def deriv(self):
+        """
+        finding dZ2/dA1:
+        from (9) Z2{n,m} = W2{n,n} dot A1{n,m} + b2{n,1}
+        let i,j in range(n), dropping m for now
+        Z2[i] = W2[i]{n} dot A1{n} + b2[i] = {sum over j} W2[i][j] * A1[j] + b2[i]
+        dZ2[i]/dA1[j] = W2[i,j]
+        dZ2/dA1[j] = W2[:,j]
+        dZ2/dA1 = W2
+        adding m back in: let k,l in range(m):
+        dZ2[:,k]/dA1[:,l] = 0 if l != k, else W2
+        dZ2[:,k]/dA1[:,k] = W2
+        dZ2/dA1{n,n} = W2{n,n}
+        """
+        return self.weights.W
+
+    def deriv_loss(self, DZ):
+        """
+        Dropping m again for a moment:
+        DA1{n} = DZ2 * dZ2/dA1 = DZ2{n} * W2{n,n} = W2.T{n,n} dot DZ2{n}
+        The derivative of loss with respect to a particular A1 value
+        Bringing m back in the picture is easy:
+        (17) DA1{n,m} = W2.T{n,n} dot DZ2{n,m}
+
+        here DZ is the loss derivative handed down by the following layer,
+        i.e. DZ = next_step.deriv_loss(self.apply(A))
+        """
+        return np.dot(self.weights.W.T, DZ)
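diff --git a/src/gradient_check.py b/src/gradient_check.py
new file mode 100644
--- /dev/null
+++ b/src/gradient_check.py
@@ -0,0 +1,70 @@
+"""
+Finite-difference check of equations (15), (16), and (17) from weights.py.
+
+NOTE: this file is an illustrative sketch, not part of the original commit;
+the module name, the toy loss, and the tolerances are assumptions. It uses
+L = 1/(2m) * sum(Z**2) as the loss, whose per-example derivative with
+respect to Z is just Z, so DZ = Z under this repo's convention (the 1/m
+from averaging over examples lives in Weights.deriv_loss and
+Biases.deriv_loss, while Transformation.deriv_loss stays per-example).
+"""
+
+import numpy as np
+
+from weights import Transformation
+
+
+def check_transformation_gradients(n_in=4, n_out=3, m=6, eps=1e-6, seed=0):
+    rng = np.random.default_rng(seed)
+    A = rng.random((n_in, m))
+    layer = Transformation(prev_size=n_in, next_step_size=n_out)
+
+    def loss(A_=None):
+        Z = layer.apply(A if A_ is None else A_)
+        return 0.5 * np.sum(Z**2) / m
+
+    DZ = layer.apply(A)  # per-example dL/dZ for the toy loss
+
+    # (15) DW = 1/m * DZ dot A.T, checked entry by entry
+    DW = layer.weights.deriv_loss(m=m, A=A, DZ=DZ)
+    DW_num = np.zeros_like(layer.weights.W)
+    for i in range(n_out):
+        for j in range(n_in):
+            layer.weights.W[i, j] += eps
+            up = loss()
+            layer.weights.W[i, j] -= 2 * eps
+            down = loss()
+            layer.weights.W[i, j] += eps  # restore
+            DW_num[i, j] = (up - down) / (2 * eps)
+    assert np.allclose(DW, DW_num, atol=1e-5), "(15) disagrees with finite differences"
+
+    # (16) Db = 1/m * row-sums of DZ
+    Db = layer.biases.deriv_loss(m=m, DZ=DZ)
+    Db_num = np.zeros_like(layer.biases.b)
+    for i in range(n_out):
+        layer.biases.b[i, 0] += eps
+        up = loss()
+        layer.biases.b[i, 0] -= 2 * eps
+        down = loss()
+        layer.biases.b[i, 0] += eps  # restore
+        Db_num[i, 0] = (up - down) / (2 * eps)
+    assert np.allclose(Db, Db_num, atol=1e-5), "(16) disagrees with finite differences"
+
+    # (17) DA = W.T dot DZ is per-example, so divide by m before comparing
+    # against the derivative of the averaged loss
+    DA = layer.deriv_loss(DZ=DZ)
+    DA_num = np.zeros_like(A)
+    for j in range(n_in):
+        for l in range(m):
+            A_up = A.copy()
+            A_up[j, l] += eps
+            A_down = A.copy()
+            A_down[j, l] -= eps
+            DA_num[j, l] = (loss(A_up) - loss(A_down)) / (2 * eps)
+    assert np.allclose(DA / m, DA_num, atol=1e-5), "(17) disagrees with finite differences"
+
+    print("(15), (16), and (17) all match finite differences")
+
+
+if __name__ == "__main__":
+    check_transformation_gradients()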
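diff --git a/src/softmax_jacobian_check.py b/src/softmax_jacobian_check.py
new file mode 100644
--- /dev/null
+++ b/src/softmax_jacobian_check.py
@@ -0,0 +1,48 @@
+"""
+Checks the softmax Jacobian assembled in the notebook experiments
+(diag(p) - outer(p, p) per example) against the cross-entropy shortcut
+DZ = Y_hat - Y that Softmax.deriv_loss uses, equation (14).
+
+NOTE: illustrative sketch, not part of the original commit. It follows the
+row-wise layout of the notebook cell (each row of Z is one example), not
+the column-wise layout used elsewhere in the package.
+"""
+
+import numpy as np
+
+
+def activate(Z):
+    # row-wise softmax, as in the notebook cell
+    max_Z = np.amax(Z, 1).reshape(Z.shape[0], 1)
+    eZ = np.exp(Z - max_Z)
+    return eZ / eZ.sum(axis=1, keepdims=True)
+
+
+def jacobian(Z):
+    # J[b, i, j] = d softmax(Z[b])[i] / d Z[b, j] = p_i * (delta_ij - p_j)
+    p = activate(Z)
+    diag = np.einsum("bi,ij->bij", p, np.eye(p.shape[-1]))
+    outer = np.einsum("bi,bj->bij", p, p)
+    return diag - outer
+
+
+def main(batch=2, n=3, seed=0):
+    rng = np.random.default_rng(seed)
+    Z = rng.random((batch, n))
+    p = activate(Z)
+
+    # one-hot labels, one per row
+    Y = np.zeros_like(p)
+    Y[np.arange(batch), rng.integers(n, size=batch)] = 1
+
+    # cross-entropy gives dL/dp = -Y / p; chain it through the Jacobian...
+    dL_dp = -Y / p
+    DZ_chain = np.einsum("bi,bij->bj", dL_dp, jacobian(Z))
+
+    # ...and compare with the closed-form shortcut (14): DZ = Y_hat - Y
+    assert np.allclose(DZ_chain, p - Y)
+    print("Jacobian chain rule matches the shortcut DZ = Y_hat - Y")
+
+
+if __name__ == "__main__":
+    main()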