0%

tensorflow(一)

深度学习原理

梯度下降

深度学习简单实现

最基础的全连接网络实现(手写前向传播与梯度下降)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import tensorflow as tf
from tensorflow.keras import datasets
# Train a three-layer fully connected network on MNIST with hand-written
# forward propagation and plain gradient-descent updates.

# Load the data; only the training split is used below.
(x, y), (x_val, y_val) = datasets.mnist.load_data()
# Convert to tensors: images as float32 divided by 255, labels as int32.
x = tf.convert_to_tensor(x, dtype=tf.float32) / 255.
y = tf.convert_to_tensor(y, dtype=tf.int32)
print(x.shape, y.shape)
# Wrap x and y in a tf.data dataset and split it into batches of 200.
train_dataset = tf.data.Dataset.from_tensor_slices((x, y))
train_dataset = train_dataset.batch(200)

# Layer shapes: [b, 784] => [b, 256] => [b, 128] => [b, 10]
# These are the parameters used by the forward pass; they are the values
# that gradient descent changes.
w1 = tf.Variable(tf.random.truncated_normal([784, 256], stddev=0.1))
b1 = tf.Variable(tf.zeros([256]))
w2 = tf.Variable(tf.random.truncated_normal([256, 128], stddev=0.1))
b2 = tf.Variable(tf.zeros([128]))
w3 = tf.Variable(tf.random.truncated_normal([128, 10], stddev=0.1))
b3 = tf.Variable(tf.zeros([10]))
# Kept in one list so gradients and updates can be handled uniformly.
params = [w1, b1, w2, b2, w3, b3]

lr = 1e-3

# Distinct loop names keep the full-dataset tensors `x` and `y` intact.
for step, (x_batch, y_batch) in enumerate(train_dataset):
    # x_batch: [b, 28, 28] => [b, 28*28]; y_batch: [b]
    x_batch = tf.reshape(x_batch, [-1, 28*28])

    with tf.GradientTape() as tape:
        # [b, 784]@[784, 256] + [256] => [b, 256]
        h1 = tf.nn.relu(x_batch@w1 + b1)
        # [b, 256] => [b, 128]
        h2 = tf.nn.relu(h1@w2 + b2)
        # [b, 128] => [b, 10]
        out = h2@w3 + b3

        # Labels: [b] => one-hot [b, 10], matching the shape of `out`.
        y_onehot = tf.one_hot(y_batch, depth=10)

        # Mean squared error: element-wise square, then mean to a scalar.
        loss = tf.reduce_mean(tf.square(y_onehot - out))

    # Compute the gradient of the loss with respect to every parameter.
    grads = tape.gradient(loss, params)
    # Gradient-descent update in place: p = p - lr * grad.
    for param, grad in zip(params, grads):
        param.assign_sub(lr * grad)

    if step % 100 == 0:
        print(step, 'loss', float(loss))

使用tensorflow的部分接口实现

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import datasets, optimizers, Sequential, layers


def data_pre_process(x, y):
    """Cast one (x, y) pair to the dtypes used for training.

    Returns:
        A tuple of x cast to float32 and divided by 255, and y cast to int32.
    """
    scaled = tf.cast(x, dtype=tf.float32) / 255.
    labels = tf.cast(y, dtype=tf.int32)
    return scaled, labels


def dataLoader():
    """Build the batched Fashion-MNIST training and validation datasets.

    Returns:
        A tuple (db, db_val) of tf.data datasets. Each one is mapped through
        data_pre_process and split into batches of 128.
    """
    (train_x, train_y), (val_x, val_y) = datasets.fashion_mnist.load_data()

    def _batched(images, labels):
        # Slice into samples, preprocess each sample, then batch.
        pairs = tf.data.Dataset.from_tensor_slices((images, labels))
        return pairs.map(data_pre_process).batch(128)

    return _batched(train_x, train_y), _batched(val_x, val_y)


if __name__ == "__main__":
    # Load the batched training and validation datasets.
    db, db_val = dataLoader()

    # Five-layer MLP. The last layer has no activation so it outputs raw
    # logits, which is what from_logits=True and the softmax below expect.
    model = Sequential([
        layers.Dense(256, activation=tf.nn.relu),
        layers.Dense(128, activation=tf.nn.relu),
        layers.Dense(64, activation=tf.nn.relu),
        layers.Dense(32, activation=tf.nn.relu),
        layers.Dense(10),
    ])
    model.build(input_shape=[None, 28*28])
    model.summary()

    # Named `optimizer` so the imported `optimizers` module is not shadowed;
    # `learning_rate` replaces the deprecated `lr` keyword.
    optimizer = optimizers.Adam(learning_rate=1e-3)
    # Training epochs.
    for epoch in range(30):
        # Within each epoch the dataset is processed batch by batch.
        for step, (x, y) in enumerate(db):
            x = tf.reshape(x, [-1, 28*28])
            # Record the forward pass so gradients can be computed.
            with tf.GradientTape() as tape:
                logits = model(x)
                y_onehot = tf.one_hot(y, depth=10)
                # MSE is computed only for logging; cross-entropy is trained on.
                loss_mse = tf.reduce_mean(tf.losses.MSE(y_onehot, logits))
                loss_ce = tf.reduce_mean(
                    tf.losses.categorical_crossentropy(y_onehot, logits, from_logits=True))

            grads = tape.gradient(loss_ce, model.trainable_variables)
            # Apply the gradient update.
            optimizer.apply_gradients(zip(grads, model.trainable_variables))

            if step % 100 == 0:
                print(epoch, step, 'loss:', float(loss_ce), float(loss_mse))

        # Accuracy on the test/validation set, computed once per epoch.
        total_correct = 0
        total_num = 0
        for x, y in db_val:
            x = tf.reshape(x, [-1, 28*28])
            logits = model(x)
            prob = tf.nn.softmax(logits, axis=1)

            # argmax returns int64; cast to int32 to compare with the labels.
            pred = tf.argmax(prob, axis=1)
            pred = tf.cast(pred, dtype=tf.int32)

            correct = tf.equal(pred, y)
            correct = tf.reduce_sum(tf.cast(correct, dtype=tf.int32))
            total_correct += int(correct)
            total_num += x.shape[0]

        acc = total_correct/total_num
        print(epoch, 'test acc:', float(acc))

tensorflow高级api使用

所有的深度学习都包含上面几个步骤

  • 加载数据
  • 迭代
  • 梯度更新
  • 测试/验证精度计算

因此,tensorflow将上面的固定步骤封装在统一的接口中,方便使用

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import datasets, optimizers, Sequential, layers


def data_pre_process(x, y):
    """Return x as float32 divided by 255 and y cast to int32."""
    return tf.cast(x, dtype=tf.float32) / 255., tf.cast(y, dtype=tf.int32)


def dataLoader():
    """Build batched Fashion-MNIST datasets for a model with flat inputs.

    Both splits get the same treatment: images are flattened to 28*28
    values, labels are one-hot encoded with depth 10, each sample is mapped
    through data_pre_process, and the result is batched by 128.

    Returns:
        A tuple (db, db_val) of tf.data datasets for training and validation.
    """
    (x, y), (x_val, y_val) = datasets.fashion_mnist.load_data()

    def _build(images, labels):
        # -1 infers the sample count instead of hard-coding 60000, so the
        # same code works for the validation split as well.
        images = images.reshape(-1, 28*28)
        labels = tf.one_hot(labels, depth=10)
        pairs = tf.data.Dataset.from_tensor_slices((images, labels))
        return pairs.map(data_pre_process).batch(128)

    db = _build(x, y)
    # Preprocessed like the training split so it matches the model's
    # input and output shapes.
    db_val = _build(x_val, y_val)

    return db, db_val


if __name__ == "__main__":
    # Load the batched datasets; only the training set is used below.
    db, db_val = dataLoader()

    # Five-layer MLP. The last layer has no activation so it outputs raw
    # logits, as required by the from_logits=True loss configured below.
    model = Sequential([
        layers.Dense(256, activation=tf.nn.relu),
        layers.Dense(128, activation=tf.nn.relu),
        layers.Dense(64, activation=tf.nn.relu),
        layers.Dense(32, activation=tf.nn.relu),
        layers.Dense(10),
    ])
    model.build(input_shape=[None, 28*28])
    model.summary()
    # compile() takes the optimizer, the loss function and the metrics to
    # report; see the Keras documentation for further options.
    # `learning_rate` replaces the deprecated `lr` keyword.
    model.compile(optimizer=optimizers.Adam(learning_rate=0.0001),
                  loss=tf.losses.CategoricalCrossentropy(from_logits=True),
                  metrics=['accuracy'])
    # fit() starts training on db for the given number of epochs. The x and y
    # in db must match the model's input and output dimensions.
    model.fit(db, epochs=1000)

常见问题

过拟合和欠拟合

过拟合:训练精度很高,测试精度不够(可能)

欠拟合:训练精度不够,测试精度也不够(可能)——模型复杂度不够

过拟合解决

  • 更多数据
  • 更简单的模型
  • 数据增强

Regularization

image-20230702195941412

前面是误差,后面是参数的范数,参数的一范数越小说明参数越接近于0,那么拟合出来的模型就越平滑,出现过拟合的可能性就越小

image-20230702200316610

两种regularization方式,分别是一范数(L1)和二范数(L2)

Momentum

image-20230702200957049

动量设置:参数的更新不仅受当前梯度的影响,还与上一次梯度的方向有关

Learning rate

image-20230702201327476

学习率一般刚开始比较大,后续慢慢变小,前期学习率大变化快,后续会较慢

提前停止(Early Stopping)

如果出现了训练精度还在提高,测试精度不提高了,说明已经过拟合,可以停止训练

Dropout

image-20230703092847561

每一次训练都有一些连线可能中断

注意dropout参数的含义:TF1的tf.nn.dropout使用keep_prob(保留概率),与pytorch的p(丢弃概率)相反;TF2/Keras的Dropout(rate)与pytorch一样,表示丢弃概率

dropout在做test时不能使用——要手动取消

为什么要卷积

为什么要卷积?不使用简单的Dense层?

  • 数据存储需求大

视野?滑动窗口?这个窗口是卷积核?cv里面锐化模糊边缘提取的卷积核

卷积核的个数与大小,也就是Conv2D中的前两个参数:

  • 卷积核的大小问题

[c, 3, 3]一个卷积核可以将一张图片卷积到一个channels为1的新层次,c为图片的通道数

image-20230703095053915

如图,这个[3, 3, 3]依次对三个通道做乘法,然后将结果加起来得到一个一通道的数据,这个一通道的数据代表着原图像在某一层次上的特征

有时候我们需要更多的特征,这时候就需要[N, c, 3, 3]这样N个卷积来提取特征,就会得到N个通道的特征

如下面的(64, 3)其中的64就是上面的N代表着64个通道的特征,3是卷积核的大小,也就是[c, 3, 3]中的3,c会默认与图片的通道数相同,所以不需要额外设置

1
# Apply a 2-D convolution to `inputs` (defined elsewhere). Assuming this is
# Keras' Conv2D: 64 is the number of kernels (output feature channels) and 3
# is the kernel size; ReLU activation and 'same' padding are requested.
conv1 = Conv2D(64, 3, activation='relu', padding='same')(inputs)

image-20230702203217415

为什么要下采样(池化)

不同的程度可以获取到不同层级的特征

image-20230703095836625