From e5e8913017788652262a4b30ec1a9836cf7b86a3 Mon Sep 17 00:00:00 2001
From: Caina Rose Paul <nyl55liz@rhrk.uni-kl.de>
Date: Sat, 9 Dec 2023 19:06:35 +0100
Subject: [PATCH] Update file exercise3.tex

---
 exercises/exercise3/exercise3.tex | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/exercises/exercise3/exercise3.tex b/exercises/exercise3/exercise3.tex
index c656c54..2d1b0a8 100644
--- a/exercises/exercise3/exercise3.tex
+++ b/exercises/exercise3/exercise3.tex
@@ -58,6 +58,20 @@ In the following, we want to derive the gradient of this loss function with resp
 \subsection{Given a function $f(h) = \sigma(Wh)$ where $h \in \mathbb{R}^d$ and \\ $W \in \mathbb{R}^{n \times d}$. Here the sigmoid function $\sigma$ is applied element-wise on a vector. Show that \\ $\frac{\partial f}{\partial h} = \text{diag}(\sigma'(Wh))W \in \mathbb{R}^{n \times d}$, where $\frac{\partial f}{\partial h}$ denotes the Jacobian matrix of $f$ with respect to $h$, and diag$(\sigma'(Wh))$ is the diagonal matrix of the vector $\sigma'(Wh)$.}
 
 \subsection{Write down $\frac{\partial L}{\partial W}$ as expanded sum for $T = 3$. Use the chain rule to show that we will need to multiply $T - 1$ matrices of the form $(\text{diag}(\sigma')W)$.}
+\[
+\frac{\partial L}{\partial W} = \frac{\partial L_1}{\partial h_1} \frac{\partial h_1}{\partial h_1} \frac{\partial h_1}{\partial W} + \frac{\partial L_2}{\partial h_2} \frac{\partial h_2}{\partial h_1} \frac{\partial h_1}{\partial W} + \frac{\partial L_3}{\partial h_3} \frac{\partial h_3}{\partial h_2} \frac{\partial h_2}{\partial h_1} \frac{\partial h_1}{\partial W}
+\]
+\[
+\text{From the previous part we know that } \frac{\partial f}{\partial h} = \text{diag}(\sigma'(Wh))W
+\]
+\begin{align}
+h_t & = \sigma(Wh_{t-1} + Ux_t) \\
+\frac{\partial h_t}{\partial h_{t-1}} & = \text{diag}(\sigma'(Wh_{t-1} + Ux_t))W
+\end{align}
+
+When \( T = 3 \), the longest chain of factors is \(\frac{\partial h_3}{\partial h_2} \frac{\partial h_2}{\partial h_1}\).
+Each factor has the form \(\text{diag}(\sigma')W\), evaluated at its own time step, so for \( T = 3 \) we need to multiply \( T - 1 = 2 \) such matrices.
+
 
 \subsection{Let $\text{diag}(\sigma')W = A =
 \begin{pmatrix}
-- 
GitLab