Deep Learning Notes (2): Parameter Norm Penalties L1 and L2

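For reference, the two penalties implemented in the code below can be written as follows (lambd plays the role of λ; only the weight matrices W are penalized, never the biases b):

$$J_{L1}(W) = J(W) + \lambda \lVert W \rVert_1, \qquad \nabla_W J_{L1} = \nabla_W J + \lambda \,\mathrm{sign}(W)$$

$$J_{L2}(W) = J(W) + \tfrac{\lambda}{2} \lVert W \rVert_2^2, \qquad \nabla_W J_{L2} = \nabla_W J + \lambda W$$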

Custom L1/L2 regularizer classes

# -*- coding: utf-8 -*-
# author:
# time : 2021/4/28 13:04
# task: custom regularizer classes.
""" imports """
from abc import ABC, abstractmethod
import re

import numpy as np


class RegularizerBase(ABC):
    """Base class with two methods, loss and grad: grad is used during
    backpropagation, loss adds the regularization contribution when the
    total loss is computed."""
    def __init__(self):
        super().__init__()

    @abstractmethod
    def loss(self, **kwargs):
        raise NotImplementedError

    @abstractmethod
    def grad(self, **kwargs):
        raise NotImplementedError


""" L1 / L2 regularizer classes, both inherit from the base class """
class L1_Regularizer(RegularizerBase):
    """L1 regularization. Formula: J = J + lambd * |W|, so the only hyperparameter is lambd."""
    def __init__(self, lambd=0.001):
        super().__init__()
        self.lambd = lambd

    # loss contribution
    def loss(self, params):
        loss = 0
        pattern = re.compile(r"^W\d+")  # keys of the form W1, W2, W3 ...; the biases b are not regularized
        for key, val in params.items():
            if pattern.match(key):
                loss += np.sum(np.abs(val)) * self.lambd
        return loss

    # gradient contribution
    def grad(self, W):  # formula: lambd * sign(W)
        grad = self.lambd * np.sign(W)
        return grad


class L2_Regularizer(RegularizerBase):
    """L2 regularization. Formula: J = J + 0.5 * lambd * W.T @ W"""
    def __init__(self, lambd):
        super().__init__()
        self.lambd = lambd

    def loss(self, params):
        loss = 0
        pattern = re.compile(r"W\d+")
        for key, val in params.items():
            if pattern.match(key):
                loss += 0.5 * np.sum(np.square(val)) * self.lambd
                # np.sqrt() is the square root, np.square() the element-wise square.
                # loss += 0.5 * np.sum(val**2) * self.lambd
        return loss

    def grad(self, W):
        grad = self.lambd * W
        return grad


class RegularizerInitializer(object):
    """Parse a spec string such as "l2(lambd=0.001)" and return the matching regularizer."""
    def __init__(self, regular_name="l2"):
        self.regular_name = regular_name

    def __call__(self):
        r = r"([a-zA-Z]*)=([^,)]*)"
        regular_str = self.regular_name.lower()  # e.g. "l2(lambd=0.001)"
        # extract the keyword arguments from the spec string
        kwargs = dict([(i, eval(j)) for (i, j) in re.findall(r, regular_str)])
        if "l1" in regular_str:
            regular = L1_Regularizer(**kwargs)
        elif "l2" in regular_str:
            regular = L2_Regularizer(**kwargs)
        else:
            raise ValueError(f"Unrecognized regular: {regular_str}")
        return regular  # return the regularizer instance
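As a quick sanity check of the classes above, here is a toy usage sketch (the params dict and its values are made up; it assumes the classes above are in scope, e.g. in the same file):

# minimal check of the regularizer classes, using a made-up parameter dict
import numpy as np

params = {"W1": np.array([[0.5, -1.0], [2.0, 0.0]]), "b1": np.zeros((1, 2))}

l2 = RegularizerInitializer("l2(lambd=0.01)")()   # parsed into L2_Regularizer(lambd=0.01)
print(l2.loss(params))            # 0.5 * 0.01 * (0.25 + 1 + 4 + 0) = 0.02625; b1 is ignored
print(l2.grad(params["W1"]))      # 0.01 * W1

l1 = RegularizerInitializer("l1(lambd=0.01)")()   # parsed into L1_Regularizer(lambd=0.01)
print(l1.loss(params))            # 0.01 * (0.5 + 1 + 2 + 0) = 0.035
print(l1.grad(params["W1"]))      # 0.01 * sign(W1); the entry at exactly 0 gets gradient 0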

Building the regularized fully connected layer

import time

import numpy as np

# import the classes defined in earlier posts:
from utils import *
from activation import *

# import the regularizer classes
from Regular_Class import *


class FullyConnected_Regular(LayerBase):
    """Fully connected layer with regularization. Takes parameters such as
    n_in, n_out, ... and implements forward, backward, update,
    flush_gradients and _init_params."""
    def __init__(self, n_out, acti_fn, init_w, optimizer=None):
        # pass the optimizer to LayerBase for initialization
        super(FullyConnected_Regular, self).__init__(optimizer)
        self.n_out = n_out
        self.acti_fn = ActivationInitializer(acti_fn)()
        self.init_w = init_w
        self.init_weights = WeightInitializer(mode=init_w)
        self.n_in = None
        self.is_initialized = False

    def _init_params(self, **kwargs):
        b = np.zeros((1, self.n_out))
        W = self.init_weights((self.n_in, self.n_out))
        self.params = {"W": W, "b": b}
        self.gradients = {"W": np.zeros_like(W), "b": np.zeros_like(b)}
        self.is_initialized = True

    def forward(self, X, retain_derived=True):
        """Forward pass of the fully connected layer."""
        if not self.is_initialized:
            self.n_in = X.shape[1]
            self._init_params()
        W = self.params["W"]
        b = self.params["b"]
        z = X @ W + b
        a = self.acti_fn.forward(z)
        # keep the input around for the backward pass
        if retain_derived:
            self.X.append(X)
        return a

    def backward(self, dLda, retain=True, regular=None):
        """Backward pass with an optional regularization term."""
        if not isinstance(dLda, list):
            dLda = [dLda]
        dX = []
        X = self.X  # X: (batch_size, n_in); dLda: (batch_size, n_out)
        for da, x in zip(dLda, X):
            dx, dw, db = self._bwd(da, x, regular)
            dX.append(dx)
            if retain:
                self.gradients["W"] += dw
                self.gradients["b"] += db
        return dX[0] if len(X) == 1 else dX

    def _bwd(self, dLda, X, regular):
        W = self.params["W"]
        b = self.params["b"]
        z = X @ W + b
        dz = dLda * self.acti_fn.grad(z)  # a = acti_fn(z)
        dX = dz @ W.T
        dW = X.T @ dz
        db = dz.sum(axis=0, keepdims=True)
        # add the regularizer's contribution to the weight gradient
        if regular is not None:
            # n = X.shape[0]
            dW_norm = regular.grad(W)
            dW += dW_norm
        return dX, dW, db

    def hyperparams(self):
        return {
            "layer": "Fully_connected_Regularizer",
            "init_w": self.init_w,
            "n_in": self.n_in,
            "n_out": self.n_out,
            "acti_fn": str(self.acti_fn),
            "optimizer": {
                "hyperparams": self.optimizer.hyperparams,
            },
            "components": {k: v for k, v in self.params.items()},
        }
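To see that the regularized backward pass does the right thing, here is a standalone sketch (identity activation, made-up data, a hand-rolled L2 term rather than the layer class above): the analytic weight gradient X.T @ dz + lambd * W should match a finite-difference estimate of the regularized loss.

# standalone gradient check (hypothetical data), mirroring the dW computation in _bwd
import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(8, 3))
Y = rng.normal(size=(8, 2))
W = rng.normal(size=(3, 2))
lambd = 0.01

def reg_loss(W):
    # squared-error loss of a linear layer (identity activation) + L2 penalty
    return 0.5 * np.sum((X @ W - Y) ** 2) + 0.5 * lambd * np.sum(W ** 2)

# analytic gradient: data term X.T @ dz plus the regularizer term lambd * W
dz = X @ W - Y
dW = X.T @ dz + lambd * W

# central finite-difference estimate of the same gradient
num = np.zeros_like(W)
eps = 1e-6
for i in range(W.shape[0]):
    for j in range(W.shape[1]):
        Wp, Wm = W.copy(), W.copy()
        Wp[i, j] += eps
        Wm[i, j] -= eps
        num[i, j] = (reg_loss(Wp) - reg_loss(Wm)) / (2 * eps)

print(np.allclose(dW, num, atol=1e-5))  # True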
Two-layer fully connected model with regularization

from collections import OrderedDict  # the layers below are stored in an OrderedDict


def minibatch(x, batchsize=256, shuffle=True):
    N = x.shape[0]
    idx = np.arange(N)
    n_batches = int(np.ceil(N / batchsize))
    if shuffle:
        np.random.shuffle(idx)

    def mb_generator():
        for i in range(n_batches):
            yield idx[i * batchsize:(i + 1) * batchsize]

    # return an iterator over index batches, plus the number of batches
    return mb_generator(), n_batches


""" two-layer fully connected model """
class DFN(object):
    def __init__(self,
                 hidden_dims_1=None,
                 hidden_dims_2=None,
                 optimizer="sgd(lr=0.1)",
                 init_w="std_normal",
                 regular_act=None,
                 loss=CrossEntropy()):
        self.optimizer = optimizer
        self.hidden_dims_1 = hidden_dims_1
        self.hidden_dims_2 = hidden_dims_2
        self.loss = loss
        self.regular = None
        self.regular_act = regular_act
        self.is_initialized = False
        self.init_w = init_w

    def _set_params(self):
        """Model initialization: FC1 -> sigmoid -> FC2 -> softmax."""
        self.layers = OrderedDict()
        self.layers["FC1"] = FullyConnected_Regular(
            n_out=self.hidden_dims_1,
            acti_fn="sigmoid",
            init_w=self.init_w,
            optimizer=self.optimizer)
        self.layers["FC2"] = FullyConnected_Regular(
            n_out=self.hidden_dims_2,
            acti_fn="affine(slope=1.,intercept=0)",  # slope=(.*),intercept=(.*)
            init_w=self.init_w,
            optimizer=self.optimizer)
        self.layers["Softmax"] = Softmax(optimizer=self.optimizer)  # backward() below threads an extra regular argument through each layer
        if self.regular_act is not None:
            self.regular = RegularizerInitializer(self.regular_act)()  # regularizer instance
        self.is_initialized = True

    def forward(self, X):
        Xs = {}
        out = X
        for k, v in self.layers.items():
            Xs[k] = out
            out = v.forward(out)
        return out, Xs

    def backward(self, grad):
        dXs = {}
        out = grad
        for k, v in reversed(self.layers.items()):
            dXs[k] = out
            out = v.backward(out, regular=self.regular)
        return out, dXs

    def update(self):
        """Parameter update."""
        for k, v in reversed(list(self.layers.items())):
            v.update()
        self.flush_gradients()

    def flush_gradients(self, curr_loss=None):
        for k, v in self.layers.items():
            v.flush_gradients()

    def fit(self, X_train, y_train, n_epochs=20, batch_size=64, verbose=False):
        """Train on (X_train, y_train) for n_epochs with the given batch_size."""
        self.verbose = verbose
        self.n_epochs = n_epochs
        self.batch_size = batch_size
        if not self.is_initialized:
            self.n_features = X_train.shape[1]
            self._set_params()

        prev_loss = np.inf
        for i in range(n_epochs):
            loss, epoch_start = 0.0, time.time()
            batch_generator, n_batch = minibatch(X_train, self.batch_size, shuffle=True)
            # batch_generator yields index batches into X_train
            for j, batch_idx in enumerate(batch_generator):
                batch_len, batch_start = len(batch_idx), time.time()
                X_batch, y_batch = X_train[batch_idx], y_train[batch_idx]
                out, _ = self.forward(X_batch)  # forward pass
                batch_loss = self.loss(y_batch, out)
                # add every layer's regularization term to the batch loss
                if self.regular is not None:
                    for _, layerparams in self.hyperparams["components"].items():
                        assert type(layerparams) is dict
                        batch_loss += self.regular.loss(layerparams)
                grad = self.loss.grad(y_batch, out)
                _, _ = self.backward(grad)  # backward pass: compute gradients
                self.update()
                loss += batch_loss
                if self.verbose:
                    fstr = f"\t [Batch {j+1}/{n_batch} Train loss :{batch_loss:.3f} ({(time.time() - batch_start):.1f}s/batch) ]"
                    print(fstr)
            loss /= n_batch
            fstr2 = f"[Epoch {i+1}/{n_epochs} avg.loss :{loss:.3f}, Delta:{(prev_loss-loss):.3f} ({(time.time() - epoch_start):.1f}s/epoch)]"
            print(fstr2)
            prev_loss = loss

    def evaluate(self, X_test, y_test, batch_size=128):
        acc = 0.0
        batch_generator, n_batch = minibatch(X_test, batchsize=batch_size, shuffle=True)
        for j, batch_idx in enumerate(batch_generator):
            batch_len, batch_start = len(batch_idx), time.time()
            X_batch, y_batch = X_test[batch_idx], y_test[batch_idx]
            out, _ = self.forward(X_batch)
            y_pred = np.argmax(out, axis=1)       # index of the predicted class
            y_batch = np.argmax(y_batch, axis=1)  # one-hot -> class index
            acc += np.sum(y_pred == y_batch)
        return acc / X_test.shape[0]

    @property
    def hyperparams(self):
        return {
            "init_w": self.init_w,
            "loss": str(self.loss),
            "optimizer": self.optimizer,
            "regular": str(self.regular_act),
            "hidden_dims_1": self.hidden_dims_1,
            "hidden_dims_2": self.hidden_dims_2,
            "components": {k: v.params for k, v in self.layers.items()},
        }
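A quick look at what the minibatch helper returns, on a toy array with shuffling disabled so the output is deterministic (assumes the numpy import above):

# toy illustration of the minibatch helper (made-up data)
x = np.arange(10).reshape(10, 1)
gen, n_batches = minibatch(x, batchsize=4, shuffle=False)
print(n_batches)          # 3
for batch_idx in gen:
    print(batch_idx)      # [0 1 2 3], then [4 5 6 7], then [8 9]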
Training experiments:
""" 测试训练"""
def load_data(path = "..\data/mnist/mnist.npz"):f = np.load(path)X_train,y_train = f["x_train"], f["y_train"]X_test, y_test = f["x_test"], f["y_test"]f.close()return (X_train,y_train),(X_test,y_test)(X_train, y_train), (X_test, y_test) = load_data()
y_train = np.eye(10)[y_train.astype(int)]
y_test = np.eye(10)[y_test.astype(int)]
X_train = X_train.reshape(-1, X_train.shape[1]*X_train.shape[2]).astype('float32')
X_test = X_test.reshape(-1, X_test.shape[1]*X_test.shape[2]).astype('float32')print(X_train.shape, y_train.shape)
N = 20000  # use 20,000 samples for training
indices = np.random.permutation(range(X_train.shape[0]))[:N]
X_train, y_train = X_train[indices], y_train[indices]
print(X_train.shape, y_train.shape)
X_train /= 255
X_train = (X_train - 0.5) * 2   # scale inputs to [-1, 1]
X_test /= 255
X_test = (X_test - 0.5) * 2

# no regularization
model = DFN(hidden_dims_1=200, hidden_dims_2=10)
model.fit(X_train, y_train, n_epochs=20, batch_size=64)
print("without regularization -- accuracy:{}".format(model.evaluate(X_test, y_test)))
(60000, 784) (60000, 10)
(20000, 784) (20000, 10)
[Epoch 1/20 avg.loss :2.284, Delta:inf (1.9s/epoch)]
[Epoch 2/20 avg.loss :2.181, Delta:0.103 (1.9s/epoch)]
[Epoch 3/20 avg.loss :1.827, Delta:0.354 (1.9s/epoch)]
[Epoch 4/20 avg.loss :1.338, Delta:0.489 (1.8s/epoch)]
[Epoch 5/20 avg.loss :0.935, Delta:0.403 (1.9s/epoch)]
[Epoch 6/20 avg.loss :0.704, Delta:0.231 (1.9s/epoch)]
[Epoch 7/20 avg.loss :0.578, Delta:0.126 (1.9s/epoch)]
[Epoch 8/20 avg.loss :0.509, Delta:0.070 (1.9s/epoch)]
[Epoch 9/20 avg.loss :0.464, Delta:0.045 (1.9s/epoch)]
[Epoch 10/20 avg.loss :0.434, Delta:0.030 (1.9s/epoch)]
[Epoch 11/20 avg.loss :0.411, Delta:0.023 (1.9s/epoch)]
[Epoch 12/20 avg.loss :0.393, Delta:0.018 (1.9s/epoch)]
[Epoch 13/20 avg.loss :0.380, Delta:0.014 (1.9s/epoch)]
[Epoch 14/20 avg.loss :0.368, Delta:0.011 (1.9s/epoch)]
[Epoch 15/20 avg.loss :0.357, Delta:0.011 (1.9s/epoch)]
[Epoch 16/20 avg.loss :0.348, Delta:0.009 (1.9s/epoch)]
[Epoch 17/20 avg.loss :0.341, Delta:0.008 (1.9s/epoch)]
[Epoch 18/20 avg.loss :0.335, Delta:0.006 (1.9s/epoch)]
[Epoch 19/20 avg.loss :0.328, Delta:0.007 (1.9s/epoch)]
[Epoch 20/20 avg.loss :0.322, Delta:0.006 (1.9s/epoch)]
without regularization -- accuracy:0.9188

With L2 regularization:

model = DFN(hidden_dims_1=200, hidden_dims_2=10, regular_act="l2(lambd=0.01)")
model.fit(X_train, y_train, n_epochs=20, batch_size=64)
print("with L2 regularization -- accuracy:{}".format(model.evaluate(X_test, y_test)))(60000, 784) (60000, 10)
(20000, 784) (20000, 10)
[Epoch 1/20 avg.loss :2.290, Delta:inf (2.0s/epoch)]
[Epoch 2/20 avg.loss :2.259, Delta:0.031 (2.0s/epoch)]
[Epoch 3/20 avg.loss :2.173, Delta:0.086 (2.0s/epoch)]
[Epoch 4/20 avg.loss :1.971, Delta:0.201 (2.1s/epoch)]
[Epoch 5/20 avg.loss :1.767, Delta:0.205 (2.0s/epoch)]
[Epoch 6/20 avg.loss :1.570, Delta:0.197 (2.0s/epoch)]
[Epoch 7/20 avg.loss :1.398, Delta:0.172 (2.0s/epoch)]
[Epoch 8/20 avg.loss :1.280, Delta:0.118 (2.4s/epoch)]
[Epoch 9/20 avg.loss :1.191, Delta:0.089 (2.1s/epoch)]
[Epoch 10/20 avg.loss :1.115, Delta:0.076 (2.1s/epoch)]
[Epoch 11/20 avg.loss :1.058, Delta:0.057 (2.2s/epoch)]
[Epoch 12/20 avg.loss :1.016, Delta:0.042 (2.3s/epoch)]
[Epoch 13/20 avg.loss :0.974, Delta:0.042 (2.2s/epoch)]
[Epoch 14/20 avg.loss :0.936, Delta:0.038 (2.1s/epoch)]
[Epoch 15/20 avg.loss :0.901, Delta:0.034 (2.1s/epoch)]
[Epoch 16/20 avg.loss :0.876, Delta:0.025 (2.0s/epoch)]
[Epoch 17/20 avg.loss :0.857, Delta:0.019 (2.1s/epoch)]
[Epoch 18/20 avg.loss :0.844, Delta:0.013 (2.1s/epoch)]
[Epoch 19/20 avg.loss :0.834, Delta:0.010 (2.1s/epoch)]
[Epoch 20/20 avg.loss :0.826, Delta:0.009 (2.0s/epoch)]
with L2 regularization -- accuracy:0.8514

With L1 regularization:

model = DFN(hidden_dims_1=200, hidden_dims_2=10, regular_act="l1(lambd=0.0001)")
model.fit(X_train, y_train, n_epochs=20, batch_size=64)
print("with L1 regularization -- accuracy:{}".format(model.evaluate(X_test, y_test)))[Epoch 1/20 avg.loss :2.288, Delta:inf (2.4s/epoch)]
[Epoch 2/20 avg.loss :2.261, Delta:0.027 (2.3s/epoch)]
[Epoch 3/20 avg.loss :2.197, Delta:0.063 (2.3s/epoch)]
[Epoch 4/20 avg.loss :2.002, Delta:0.195 (2.3s/epoch)]
[Epoch 5/20 avg.loss :1.750, Delta:0.251 (2.3s/epoch)]
[Epoch 6/20 avg.loss :1.494, Delta:0.256 (2.3s/epoch)]
[Epoch 7/20 avg.loss :1.222, Delta:0.272 (2.3s/epoch)]
[Epoch 8/20 avg.loss :1.002, Delta:0.219 (2.3s/epoch)]
[Epoch 9/20 avg.loss :0.848, Delta:0.155 (2.3s/epoch)]
[Epoch 10/20 avg.loss :0.729, Delta:0.118 (2.3s/epoch)]
[Epoch 11/20 avg.loss :0.641, Delta:0.088 (2.3s/epoch)]
[Epoch 12/20 avg.loss :0.582, Delta:0.059 (2.3s/epoch)]
[Epoch 13/20 avg.loss :0.541, Delta:0.041 (2.3s/epoch)]
[Epoch 14/20 avg.loss :0.512, Delta:0.029 (2.3s/epoch)]
[Epoch 15/20 avg.loss :0.491, Delta:0.021 (2.3s/epoch)]
[Epoch 16/20 avg.loss :0.475, Delta:0.016 (2.3s/epoch)]
[Epoch 17/20 avg.loss :0.462, Delta:0.013 (2.3s/epoch)]
[Epoch 18/20 avg.loss :0.452, Delta:0.010 (2.3s/epoch)]
[Epoch 19/20 avg.loss :0.442, Delta:0.010 (2.3s/epoch)]
[Epoch 20/20 avg.loss :0.435, Delta:0.007 (2.3s/epoch)]
with L1 regularization -- accuracy:0.8947

As the results show, lambd for L1 has to be chosen more carefully than for L2: the L1 penalty sparsifies the weight matrix, and once a weight is driven to exactly zero, np.sign returns 0 there, so the penalty no longer contributes any gradient to update that parameter.
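A tiny illustration of that last point, with made-up weight values: the L1 gradient has constant magnitude and vanishes only at exactly zero, while the L2 gradient shrinks in proportion to the weight.

# compare the per-weight gradients of the two penalties (hypothetical values)
import numpy as np

W = np.array([-2.0, -0.1, 0.0, 0.1, 2.0])
lambd = 0.01

print(lambd * np.sign(W))  # L1: [-0.01 -0.01  0.    0.01  0.01] -> same push for big and small weights, none at zero
print(lambd * W)           # L2: [-0.02 -0.001 0.    0.001 0.02] -> shrinks in proportion to the weight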

