From ec4afb6d6b4359d2656cde03c36e688cb744eff9 Mon Sep 17 00:00:00 2001
From: Rea Fernandes <zob06qih@rhrk.uni-kl.de>
Date: Sun, 10 Dec 2023 00:16:58 +0100
Subject: [PATCH] Update exercise3.tex with 1.1 and 1.2

---
 exercises/exercise3/exercise3.tex | 39 +++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/exercises/exercise3/exercise3.tex b/exercises/exercise3/exercise3.tex
index 2d1b0a8..8886813 100644
--- a/exercises/exercise3/exercise3.tex
+++ b/exercises/exercise3/exercise3.tex
@@ -55,8 +55,47 @@ In the following, we want to derive the gradient of this loss function with resp
 
 \subsection{Using the multivariate chain rule, show that \\ $\frac{\partial L}{\partial W} = \sum^T_{t=1}\sum^t_{k=1}\frac{\partial L_t}{\partial h_t}\frac{\partial h_t}{\partial h_k}\frac{\partial h_k}{\partial W}$.}
+Solution
+
+1. Start with the expression for $L$:
+\[
+L = \sum_{t=1}^{T} L_t
+\]
+
+2. Apply the multivariate chain rule to each $L_t$. Since $W$ enters the recurrence at every
+step, $L_t$ depends on $W$ through every hidden state $h_k$ with $k \leq t$, and all of these
+paths contribute to the gradient:
+\[
+\frac{\partial L_t}{\partial W} = \sum_{k=1}^{t} \frac{\partial L_t}{\partial h_t} \frac{\partial h_t}{\partial h_k} \frac{\partial h_k}{\partial W}
+\]
+
+3. Sum over all time steps:
+\[
+\frac{\partial L}{\partial W} = \sum_{t=1}^{T} \frac{\partial L_t}{\partial W} = \sum_{t=1}^{T} \sum_{k=1}^{t} \frac{\partial L_t}{\partial h_t} \frac{\partial h_t}{\partial h_k} \frac{\partial h_k}{\partial W}
+\]
+
+This completes the derivation via the multivariate chain rule.
 
 \subsection{Given a function $f(h) = \sigma(Wh)$ where $h \in \mathbb{R}^d$ and \\ $W \in \mathbb{R}^{n \times d}$. Here the sigmoid function $\sigma$ is applied element-wise on a vector. Show that \\ $\frac{\partial f}{\partial h} = \text{diag}(\sigma'(Wh))W \in \mathbb{R}^{n \times d}$, where $\frac{\partial f}{\partial h}$ denotes the Jacobian matrix of $f$ with respect to $h$, and diag$(\sigma'(Wh))$ is the diagonal matrix of the vector $\sigma'(Wh)$.}
+Solution
+
+Write $z = Wh$, so that $f(h) = \sigma(z)$ with $z_i = \sum_{j=1}^{d} W_{ij} h_j$. Component-wise,
+by the chain rule:
+\begin{equation*}
+\begin{aligned}
+    \frac{\partial f_i}{\partial h_j} &= \sigma'(z_i) \, \frac{\partial z_i}{\partial h_j} \\
+    &= \sigma'\big((Wh)_i\big) \, W_{ij}
+\end{aligned}
+\end{equation*}
+
+Row $i$ of the Jacobian is therefore row $i$ of $W$ scaled by $\sigma'((Wh)_i)$, which is exactly
+the matrix product $\text{diag}(\sigma'(Wh)) \, W$. (For the sigmoid, $\sigma'(x) = \sigma(x)(1 - \sigma(x))$
+element-wise, but the result holds for any element-wise activation.)
+
+This proves that
+\begin{equation*}
+    \frac{\partial f}{\partial h} = \text{diag}(\sigma'(Wh)) \, W \in \mathbb{R}^{n \times d}
+\end{equation*}
 
 \subsection{Write down $\frac{\partial L}{\partial W}$ as expanded sum for $T = 3$. Use the chain rule to show that we will need to multiply $T - 1$ matrices of the form $(\text{diag}(\sigma')W)$.}
 \[
 \frac{\partial L}{\partial W} = \frac{\partial L_1}{\partial h_1} \frac{\partial h_1}{\partial h_1} \frac{\partial h_1}{\partial W} + \frac{\partial L_2}{\partial h_2} \frac{\partial h_2}{\partial h_1} \frac{\partial h_1}{\partial W} + \frac{\partial L_3}{\partial h_3} \frac{\partial h_3}{\partial h_2} \frac{\partial h_2}{\partial h_1} \frac{\partial h_1}{\partial W}
-- 
GitLab
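
A quick numerical sanity check of the Jacobian derived in 1.2 above: the minimal NumPy sketch
below compares the closed form diag(sigma'(Wh)) W against a central finite-difference
approximation. The sizes n, d, the seed, and the helper names are arbitrary illustrative
choices, not part of the exercise.

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def sigmoid_prime(x):
    # Element-wise sigmoid derivative: sigma'(x) = sigma(x) * (1 - sigma(x))
    s = sigmoid(x)
    return s * (1.0 - s)

rng = np.random.default_rng(0)
n, d = 4, 3                          # illustrative sizes: W is n x d, h is in R^d
W = rng.normal(size=(n, d))
h = rng.normal(size=d)

# Closed form from 1.2: Jacobian of f(h) = sigma(Wh) is diag(sigma'(Wh)) @ W
J_closed = np.diag(sigmoid_prime(W @ h)) @ W

# Central finite-difference Jacobian, built column by column
eps = 1e-6
J_fd = np.empty((n, d))
for j in range(d):
    e = np.zeros(d)
    e[j] = eps
    J_fd[:, j] = (sigmoid(W @ (h + e)) - sigmoid(W @ (h - e))) / (2 * eps)

print(np.max(np.abs(J_closed - J_fd)))   # expected: ~1e-10 or smaller

This diag(sigma'(Wh)) W factor is exactly the matrix that, per 1.3, appears T - 1 times in the
expanded gradient products.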