From ec4afb6d6b4359d2656cde03c36e688cb744eff9 Mon Sep 17 00:00:00 2001
From: Rea Fernandes <zob06qih@rhrk.uni-kl.de>
Date: Sun, 10 Dec 2023 00:16:58 +0100
Subject: [PATCH] Update exercise3.tex with 1.1 and 1.2

---
 exercises/exercise3/exercise3.tex | 39 +++++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/exercises/exercise3/exercise3.tex b/exercises/exercise3/exercise3.tex
index 2d1b0a8..8886813 100644
--- a/exercises/exercise3/exercise3.tex
+++ b/exercises/exercise3/exercise3.tex
@@ -55,8 +55,47 @@ In the following, we want to derive the gradient of this loss function with resp
 
 \subsection{Using the multivariate chain rule, show that \\ $\frac{\partial L}{\partial W} = \sum^T_{t=1}\sum^t_{k=1}\frac{\partial L_t}{\partial h_t}\frac{\partial h_t}{\partial h_k}\frac{\partial h_k}{\partial W}$.}
 
+Solution:
+
+1. Start with the expression for $L$:
+\[
+L = \sum_{t=1}^{T} L_t
+\]
+
+2. Apply the multivariate chain rule to each $L_t$. The loss $L_t$ depends on $h_t$, and $W$ enters the computation of $h_t$ at every time step $k \le t$: directly at step $t$, and indirectly through each earlier hidden state $h_k$. Collecting one contribution per step gives
+\[
+\frac{\partial L_t}{\partial W} = \sum_{k=1}^{t} \frac{\partial L_t}{\partial h_t} \frac{\partial h_t}{\partial h_k} \frac{\partial h_k}{\partial W},
+\]
+where $\frac{\partial h_k}{\partial W}$ denotes the immediate partial derivative of $h_k$ with respect to $W$, holding $h_{k-1}$ fixed.
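+
+The factor $\frac{\partial h_t}{\partial h_k}$ itself unrolls, again by the chain rule along the sequence of hidden states, into a product of one-step Jacobians; this product form is what later produces the repeated $(\text{diag}(\sigma')W)$ factors. The factors are matrices, so the product is ordered with the larger index on the left, and for $k = t$ the empty product is the identity:
+\[
+\frac{\partial h_t}{\partial h_k} = \prod_{j=k+1}^{t} \frac{\partial h_j}{\partial h_{j-1}}
+\]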
+
+3. Sum over all time steps $t = 1, \dots, T$ and substitute the expression from step 2:
+\[
+\frac{\partial L}{\partial W} = \sum_{t=1}^{T} \frac{\partial L_t}{\partial W} = \sum_{t=1}^{T} \sum_{k=1}^{t} \frac{\partial L_t}{\partial h_t} \frac{\partial h_t}{\partial h_k} \frac{\partial h_k}{\partial W}
+\]
+
+This proves the stated identity using the multivariate chain rule.
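+
+As a quick sanity check, for $T = 2$ the double sum expands to three terms:
+\[
+\frac{\partial L}{\partial W} = \underbrace{\frac{\partial L_1}{\partial h_1}\frac{\partial h_1}{\partial W}}_{t=1,\,k=1} + \underbrace{\frac{\partial L_2}{\partial h_2}\frac{\partial h_2}{\partial W}}_{t=2,\,k=2} + \underbrace{\frac{\partial L_2}{\partial h_2}\frac{\partial h_2}{\partial h_1}\frac{\partial h_1}{\partial W}}_{t=2,\,k=1}
+\]
+using $\frac{\partial h_2}{\partial h_2} = I$ for the $t = 2,\, k = 2$ term.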
+
 \subsection{Given a function $f(h) = \sigma(Wh)$ where $h \in \mathbb{R}^d$ and \\ $W \in \mathbb{R}^{n \times d}$. Here the sigmoid function $\sigma$ is applied element-wise on a vector. Show that \\ $\frac{\partial f}{\partial h} = \text{diag}(\sigma'(Wh))W \in \mathbb{R}^{n \times d}$, where $\frac{\partial f}{\partial h}$ denotes the Jacobian matrix of $f$ with respect to $h$, and diag$(\sigma'(Wh))$ is the diagonal matrix of the vector $\sigma'(Wh)$.}
 
+Solution:
+
+Write $z = Wh \in \mathbb{R}^n$, so that $f(h) = \sigma(z)$ has components $f_i(h) = \sigma(z_i)$ with $z_i = \sum_{j=1}^{d} W_{ij} h_j$. Applying the chain rule entrywise, the $(i,j)$-th entry of the Jacobian is
+\begin{equation*}
+\begin{aligned}
+    \left(\frac{\partial f}{\partial h}\right)_{ij} &= \frac{\partial f_i}{\partial h_j} \\
+    &= \sigma'(z_i) \cdot \frac{\partial z_i}{\partial h_j} \\
+    &= \sigma'(z_i) \cdot W_{ij},
+\end{aligned}
+\end{equation*}
+where $\sigma'(z_i) = \sigma(z_i)(1 - \sigma(z_i))$ for the sigmoid. Row $i$ of the Jacobian is therefore row $i$ of $W$ scaled by the scalar $\sigma'(z_i)$, which is exactly left-multiplication of $W$ by the diagonal matrix $\text{diag}(\sigma'(Wh))$.
+
+Thus we have shown that
+\begin{equation*}
+    \frac{\partial f}{\partial h} = \text{diag}(\sigma'(Wh)) \cdot W \in \mathbb{R}^{n \times d}.
+\end{equation*}
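+
+This Jacobian is exactly what backpropagation through time multiplies at each step: assuming the hidden state evolves as $h_t = \sigma(W h_{t-1})$ (the form suggested by the repeated $(\text{diag}(\sigma')W)$ factors in the next part), applying the result above with $h = h_{t-1}$ gives
+\begin{equation*}
+    \frac{\partial h_t}{\partial h_{t-1}} = \text{diag}(\sigma'(W h_{t-1})) \cdot W,
+\end{equation*}
+one such matrix for every step in the chain $\frac{\partial h_t}{\partial h_k}$.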
+
 \subsection{Write down $\frac{\partial L}{\partial W}$ as expanded sum for $T = 3$. Use the chain rule to show that we will need to multiply $T - 1$ matrices of the form $(\text{diag}(\sigma')W)$.}
 \[
 \frac{\partial L}{\partial W} = \frac{\partial L_1}{\partial h_1} \frac{\partial h_1}{\partial h_1} \frac{\partial h_1}{\partial W} + \frac{\partial L_2}{\partial h_2} \frac{\partial h_2}{\partial h_1} \frac{\partial h_1}{\partial W} + \frac{\partial L_3}{\partial h_3} \frac{\partial h_3}{\partial h_2} \frac{\partial h_2}{\partial h_1} \frac{\partial h_1}{\partial W}
-- 
GitLab