TensorFlow2.x 学习笔记(六)随机梯度下降以及数据可视化
文章目录
梯度下降
简介
梯度
梯度是一个向量,其方向是函数在该点处方向导数取得最大值的方向,其模等于该最大方向导数的值。

$$\operatorname{grad} f(x,y) = \nabla f(x,y) = \left(\frac{\partial f}{\partial x}, \frac{\partial f}{\partial y}\right) = \frac{\partial f}{\partial x}\,\mathbf{i} + \frac{\partial f}{\partial y}\,\mathbf{j}$$
利用梯度优化
梯度方向是函数增长最快的方向,因此搜索函数最小值的过程就是不断向负梯度方向移动的过程(其中 $\alpha_t$ 为第 $t$ 步的学习率):

$$\theta_{t+1} = \theta_t - \alpha_t \nabla f(\theta_t)$$
AutoGrad with Tensorflow
GradientTape
- with tf.GradientTape() as tape: 
  - Build computation graph
- $loss = f_\theta(x)$
 
- [w_grad] = tape.gradient(loss,[w])
w = tf.constant(1.)
b = tf.constant(2.)
x = tf.constant(3.)
# Computed outside any tape context, so no tape records this operation.
y = w * x
with tf.GradientTape() as tape:
    # w is a constant, not a tf.Variable, so it has to be watched explicitly.
    tape.watch([w])
    y2 = w * x
# y was not recorded by the tape, hence there is no gradient for it.
grad1 = tape.gradient(y, [w])
print(grad1)  # [None]
# A non-persistent tape can compute only one set of gradients. The call above
# already used it up, so a second call raises RuntimeError; catch it so the
# rest of the demo still runs.
try:
    grad2 = tape.gradient(y2, [w])
except RuntimeError as err:
    print(err)
# Record the computation again on a fresh tape to obtain the real gradient.
with tf.GradientTape() as tape:
    tape.watch([w])
    y2 = w * x
grad2 = tape.gradient(y2, [w])
print(grad2)  # d(w*x)/dw = x, i.e. 3.0
Persistent GradientTape
非 persistent 的 GradientTape 只能调用一次 gradient(),调用之后它所持有的资源就会被释放;可以开启 persistent=True 选项来解决这个问题,用完以后记得用 del 手动释放。
# persistent=True keeps the recorded operations alive, so gradient() may be
# called on the same tape more than once.
with tf.GradientTape(persistent=True) as tape:
    tape.watch([w])
    y = w * x
grad1 = tape.gradient(y, [w])
print(grad1)
grad1 = tape.gradient(y, [w])
print(grad1)
grad1 = tape.gradient(y, [w])
print(grad1)
# Drop the reference only after the last gradient() call: using `tape` after
# `del tape` raises NameError.
del tape
$2^{nd}$-order
# Second-order derivative via two nested tapes.
# tf.Variable objects are used here, and the snippet does not call tape.watch.
w = tf.Variable(1.0)
b = tf.Variable(2.0)
x = tf.Variable(3.0)
with tf.GradientTape() as t1:
    with tf.GradientTape() as t2:
        # y = w*x^2 + w*b
        y = w * x * x + w * b
    # First-order gradients. This call sits inside the t1 context so that the
    # gradient computation itself is recorded by t1.
    # Analytically: dy/dx = 2*w*x and dy/db = w.
    dx,db = t2.gradient(y, [x, b])
    print(dx,db)
# Differentiate dy/dx once more with respect to x: d2y/dx2 = 2*w.
dx2 = t1.gradient(dx, [x])
print(dx2)
激活函数及其梯度
Sigmoid/Logistic
- $f(x) = \sigma(x) = \frac{1}{1+e^{-x}}$
- $f(x) \in (0,1)$
- $\frac{d}{dx}\sigma(x) = \frac{d}{dx}\left(\frac{1}{1+e^{-x}}\right) = \sigma(x) - \sigma(x)^2$
- 优点:光滑,取值在(0,1)之间
- 缺点:在远处梯度很小
# Gradient of the sigmoid evaluated at 10 evenly spaced points in [-10, 10].
a = tf.linspace(-10., 10., 10)
with tf.GradientTape() as tape:
    # `a` is a plain tensor rather than a tf.Variable, so watch it explicitly.
    tape.watch(a)
    y = tf.sigmoid(a)
# The sources are passed as a list, so `da` is a list with one tensor.
da = tape.gradient(y, [a])
print(a)
print(y)
print(da)
Tanh
- $f(x) = \tanh(x) = \frac{e^x - e^{-x}}{e^x + e^{-x}} = 2\sigma(2x) - 1$
- $f(x) \in (-1,1)$
- $\frac{d}{dx}\tanh(x) = \frac{d}{dx}\frac{e^x - e^{-x}}{e^x + e^{-x}} = 1 - \tanh^2(x)$
# Evaluate tanh on `a`; the result is not assigned to anything here.
tf.tanh(a)
ReLU

$$f(x) = \begin{cases} 0 & \text{for } x < 0 \\ x & \text{for } x \geq 0 \end{cases}$$

$$f'(x) = \begin{cases} 0 & \text{for } x < 0 \\ 1 & \text{for } x \geq 0 \end{cases}$$
# Evaluate ReLU and leaky ReLU on `x`; the results are not stored here.
tf.nn.relu(x)
tf.nn.leaky_relu(x)  # for x < 0 the gradient is a small positive number
Loss及其梯度
- MSE(Mean Squared Error)
- Cross Entropy Loss
MSE
- $loss = \sum [y - (xw + b)]^2$
- $\nabla_{\theta} loss = -2\sum [y - f_{\theta}(x)] \cdot \nabla_{\theta} f_{\theta}(x)$
Softmax

$$S(y_i) = \frac{e^{y_i}}{\sum\limits_{j} e^{y_j}}$$

记 $p_i = S(a_i)$,则

$$\frac{\partial p_i}{\partial a_j} = \begin{cases} p_i(1-p_j) & \text{if } i = j \\ -p_i \cdot p_j & \text{if } i \neq j \end{cases}$$
# Gradient of an MSE loss taken on softmax outputs.
x = tf.random.normal([2, 4])  # input of shape [2, 4]
w = tf.random.normal([4, 3])  # weight of shape [4, 3]
b = tf.zeros([3])             # bias of shape [3]
y = tf.constant([2, 0])       # integer labels, one-hot encoded below with depth 3
with tf.GradientTape() as tape:
    # w and b are plain tensors, not tf.Variable, so watch them explicitly.
    tape.watch([w,b])
    prob = tf.nn.softmax(x@w + b)
    # MSE between the one-hot labels and the probabilities, averaged to a scalar.
    loss = tf.reduce_mean(tf.keras.losses.MSE(tf.one_hot(y, depth = 3), prob))
grads = tape.gradient(loss, [w,b])
print(grads[0])  # gradient with respect to w
print(grads[1])  # gradient with respect to b
Crossentropy gradient
上一章已经写了,所以这里不介绍了
# Gradient of a categorical cross-entropy loss; reuses x, w, b and y defined
# earlier in the notes.
with tf.GradientTape() as tape:
    tape.watch([w,b])
    # Raw logits are passed to the loss (from_logits=True below); no softmax
    # is applied here.
    logits = x@w + b
    loss = tf.reduce_mean(tf.keras.losses.categorical_crossentropy(tf.one_hot(y, depth = 3), logits, from_logits=True))
grads = tape.gradient(loss, [w,b])
print(grads[0])  # only the gradient with respect to w is printed
Chain Rule
- 利用链式法则进行反向传播
由于我只是学一下tf的使用,因此课程中关于单层和多层感知机的反向传播的推导就在这不赘述了,感兴趣的可以去看吴恩达的深度学习的网课
可视化
- tensorboard(tf)
- Visdom(pytorch)
tensorboard
- listen logdir
- build summary instance
- feed data into summary instance
# cd into your project directory first; binding to 0.0.0.0 is for remote
# access, and the port is chosen freely to avoid clashing with one in use.
tensorboard --logdir=./logs --host 0.0.0.0 --port=11021
create_file_writer()
# Build a log directory named after the current timestamp (YYYYmmdd-HHMMSS)
# under logs/, matching the --logdir passed to tensorboard above.
current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
log_dir = 'logs/' + current_time
# Summary writer that writes its event files into that directory.
summary_writer = tf.summary.create_file_writer(log_dir) 
image
# Using MNIST as the example. Subplots are not supported out of the box, so a
# grid has to be written by hand.
# Reshape the sample to [1, 28, 28, 1] before logging it.
sample_img = tf.reshape(sample_img, [1, 28, 28, 1])
with summary_writer.as_default():
    tf.summary.image("Training sample:", sample_img, step=0)
手写一个subplot
def plot_to_image(figure):
  """Render a matplotlib figure into a batched PNG image tensor.

  The supplied figure is closed and inaccessible after this call.

  Args:
    figure: the matplotlib figure to render.

  Returns:
    The PNG decoded with 4 channels and expanded with a leading batch
    dimension of size 1.
  """
  # Save the plot to a PNG in memory. Save through the figure object itself:
  # `plt.savefig` writes whichever figure is currently active, which is not
  # necessarily the one passed in.
  buf = io.BytesIO()
  figure.savefig(buf, format='png')
  # Closing the figure prevents it from being displayed directly inside
  # the notebook.
  plt.close(figure)
  # Convert PNG buffer to TF image. getvalue() returns the whole buffer
  # regardless of the current stream position, so no seek is needed.
  image = tf.image.decode_png(buf.getvalue(), channels=4)
  # Add the batch dimension
  image = tf.expand_dims(image, 0)
  return image
def image_grid(images):
  """Return a 5x5 grid of the MNIST images as a matplotlib figure.

  Only the first 25 images are drawn. When fewer than 25 are supplied, the
  remaining grid cells are simply left empty.

  Args:
    images: indexable collection of 2-D images; must support `len()`.

  Returns:
    The matplotlib figure containing the grid.
  """
  # Create a figure to contain the plot.
  # A subplot tutorial is available at:
  # https://morvanzhou.github.io/tutorials/data-manipulation/plt/4-1-subpot1/
  figure = plt.figure(figsize=(10,10))
  # Cap at the 25 cells of the grid, but never index past the last image.
  for i in range(min(25, len(images))):
    # Start next subplot.
    plt.subplot(5, 5, i + 1, title='name')
    plt.xticks([])
    plt.yticks([])
    plt.grid(False)
    plt.imshow(images[i], cmap=plt.cm.binary)

  return figure
# Take the first 25 samples and reshape them to [N, 28, 28], one 2-D image
# per entry, before passing them to image_grid. The former intermediate
# reshape to [N, 28, 28, 1] was undone immediately and has been dropped.
val_images = x[:25]
val_images = tf.reshape(val_images, [-1, 28, 28])
with summary_writer.as_default():
    figure = image_grid(val_images)
    # NOTE(review): `step` is not defined in this snippet; presumably it comes
    # from the surrounding training loop.
    tf.summary.image('val-images:', plot_to_image(figure), step=step)
scalar
# Log the current loss as a scalar under the tag 'train-loss'.
# NOTE(review): `loss` and `step` are not defined in this snippet; presumably
# they come from the surrounding training loop.
with summary_writer.as_default(): 
    # The loss is converted to a plain Python float before being logged.
    tf.summary.scalar('train-loss', float(loss), step=step) 
以上只是tensorboard的简单说明,可以查看TensorBoard 文档
这里记一个tensorflow的学习网站:
 简单粗暴tensorflow2