From bdb94577d97da5cf5b6ec046952dbe79e9c886bf Mon Sep 17 00:00:00 2001
From: Alican Bozkurt <alican@ece.neu.edu>
Date: Tue, 28 Jun 2016 16:28:33 -0400
Subject: [PATCH 1/2] add default value for rms_decay

---
 src/caffe/proto/caffe.proto | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto
index 1556781cbc2..6940a705eb6 100644
--- a/src/caffe/proto/caffe.proto
+++ b/src/caffe/proto/caffe.proto
@@ -219,7 +219,7 @@ message SolverParameter {
 
   // RMSProp decay value
   // MeanSquare(t) = rms_decay*MeanSquare(t-1) + (1-rms_decay)*SquareGradient(t)
-  optional float rms_decay = 38;
+  optional float rms_decay = 38 [default = 0.99];
 
   // If true, print information about the state of the net that may help with
   // debugging learning problems.

From 80f60dae071fca4457d7a439960385a4579f489d Mon Sep 17 00:00:00 2001
From: Alican Bozkurt <alican@ece.neu.edu>
Date: Tue, 28 Jun 2016 16:59:36 -0400
Subject: [PATCH 2/2] corrected rmsprop documentation

---
 docs/tutorial/solver.md | 13 +++----------
 1 file changed, 3 insertions(+), 10 deletions(-)

diff --git a/docs/tutorial/solver.md b/docs/tutorial/solver.md
index b719f715a4b..81c626386a2 100644
--- a/docs/tutorial/solver.md
+++ b/docs/tutorial/solver.md
@@ -209,18 +209,11 @@ What distinguishes the method from SGD is the weight setting $$ W $$ on which we
 The **RMSprop** (`type: "RMSProp"`), suggested by Tieleman in a Coursera course lecture, is a gradient-based optimization method (like SGD). The update formulas are
 
 $$
-(v_t)_i =
-\begin{cases}
-(v_{t-1})_i + \delta, &(\nabla L(W_t))_i(\nabla L(W_{t-1}))_i > 0\\
-(v_{t-1})_i \cdot (1-\delta), & \text{else}
-\end{cases}
+\operatorname{MS}((W_t)_i)= \delta\operatorname{MS}((W_{t-1})_i)+ (1-\delta)(\nabla L(W_t))_i^2 \\
+(W_{t+1})_i= (W_{t})_i -\alpha\frac{(\nabla L(W_t))_i}{\sqrt{\operatorname{MS}((W_t)_i)}}
 $$
 
-$$
-(W_{t+1})_i =(W_t)_i - \alpha (v_t)_i,
-$$
-
-If the gradient updates results in oscillations the gradient is reduced by times $$1-\delta$$. Otherwise it will be increased by $$\delta$$. The default value of $$\delta$$ (`rms_decay`) is set to $$\delta = 0.02$$.
+Here $$\operatorname{MS}$$ is a running average of the squared gradient and $$\delta$$ (`rms_decay`) is its decay rate; the default value is $$\delta = 0.99$$.
 
 [1] T. Tieleman, and G. Hinton.
     [RMSProp: Divide the gradient by a running average of its recent magnitude](http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf).