使用tensorflow利用KNN算法对mnist数据集进行分类

KNN算法思想总结

在训练集中数据和标签已知的情况下,输入测试数据,将测试数据的特征与训练集中对应的特征进行相互比较,找到训练集中与之最为相似的前K个数据,则该测试数据对应的类别就是K个数据中出现次数最多的那个分类,其算法的描述为:

  1. 计算测试数据与各个训练数据之间的距离;
  2. 按照距离的递增关系进行排序;
  3. 选取距离最小的K个点;
  4. 确定前K个点所在类别的出现频率;
  5. 返回前K个点中出现频率最高的类别作为测试数据的预测分类。

加载mnist数据

1
2
3
4
5
6
import tensorflow as tf
import numpy as np
import random
from tensorflow.examples.tutorials.mnist import input_data

# Read the MNIST data sets from the 'data/' directory; one_hot=True requests
# labels encoded as one-hot vectors instead of integer class ids.
mnist = input_data.read_data_sets('data/', one_hot=True)
Extracting data/train-images-idx3-ubyte.gz
Extracting data/train-labels-idx1-ubyte.gz
Extracting data/t10k-images-idx3-ubyte.gz
Extracting data/t10k-labels-idx1-ubyte.gz
1
2
# Show the shape of the image array of the training split, then the test split.
for _split in (mnist.train, mnist.test):
    print(_split.images.shape)
(55000, 784)
(10000, 784)

设置属性

1
2
3
4
5
# Total number of images in the training set and in the test set.
trainNum, testNum = 55000, 10000
# Number of training images / test images actually sampled for this run.
trainSize, testSize = 5000, 5
# Number of nearest neighbours (smallest distances) that take part in the vote.
k = 4

数据分解

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
# Draw distinct random row indices (sampling without replacement).
trainIndex = np.random.choice(trainNum, size=trainSize, replace=False)
testIndex = np.random.choice(testNum, size=testSize, replace=False)

# Pick the sampled rows out of the training split.
trainData = mnist.train.images[trainIndex]
trainLabel = mnist.train.labels[trainIndex]

# Pick the sampled rows out of the test split.
testData = mnist.test.images[testIndex]
testLabel = mnist.test.labels[testIndex]

# Report the shape of every sampled array, then the test labels themselves.
for _tag, _arr in (
    ('trainData.shape=', trainData),
    ('trainLabel.shape=', trainLabel),
    ('testData.shape=', testData),
    ('testLabel.shape=', testLabel),
):
    print(_tag, _arr.shape)
print('testLabel=', testLabel)
trainData.shape= (5000, 784)
trainLabel.shape= (5000, 10)
testData.shape= (5, 784)
testLabel.shape= (5, 10)
testLabel= [[0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]]

数据训练

1 设置变量

1
2
3
4
# Graph inputs. The leading dimension is None so any batch size can be fed.
# Image placeholders hold 784 values per row, label placeholders hold 10.
trainDataInput = tf.placeholder(tf.float32, shape=[None, 784])
trainLabelInput = tf.placeholder(tf.float32, shape=[None, 10])
testDataInput = tf.placeholder(tf.float32, shape=[None, 784])
testLabelInput = tf.placeholder(tf.float32, shape=[None, 10])

2 计算KNN距离,使用曼哈顿距离

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
# Insert a new axis at position 1 into the test batch so that it broadcasts
# against the training batch in the subtraction below.
f1 = tf.expand_dims(testDataInput, 1)

# Broadcast subtraction: a 3-D tensor holding, for every test image, the
# per-pixel difference to every training image.
f2 = tf.subtract(trainDataInput, f1)

# Manhattan (L1) distance: sum the absolute differences over the last axis.
# f3[i, j] is the distance from test image i to training image j.
# `axis` is used instead of the deprecated `reduction_indices` alias.
f3 = tf.reduce_sum(tf.abs(f2), axis=2)

with tf.Session() as sess:
    # Fetch all three tensors in a single run so the graph is evaluated once
    # instead of being recomputed for every fetch.
    p1, p2, p3 = sess.run(
        (f1, f2, f3),
        feed_dict={
            trainDataInput: trainData,
            testDataInput: testData[0:testSize],
        },
    )
    print('p1=', p1.shape)
    print('p2=', p2.shape)
    print('p3=', p3.shape)
    print('p3[0,0]=', p3[0, 0])
p1= (5, 1, 784)
p2= (5, 5000, 784)
p3= (5, 5000)
p3[0,0]= 107.035324

3 选取距离最小的K个图片

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
# Negate the distances (f4 = -f3) so that the nearest neighbours become the
# largest values, which is what tf.nn.top_k selects.
f4 = tf.negative(f3)

# f5: the k largest entries of f4 for each test image, i.e. the k smallest
#     distances in f3 (with the sign flipped).
# f6: the positions of those entries along the training-image axis.
# The configured `k` is used instead of a hard-coded 4.
f5, f6 = tf.nn.top_k(f4, k=k)

with tf.Session() as sess:
    # One run fetches everything; the feed is identical for all three tensors.
    p4, p5, p6 = sess.run(
        (f4, f5, f6),
        feed_dict={
            trainDataInput: trainData,
            testDataInput: testData[0:testSize],
        },
    )
    print('p4=', p4.shape)
    print('p4[0,0]=', p4[0, 0])

    # p5 and p6 have one row per test image and k columns: the k nearest
    # training images of each test image.
    print('p5=', p5.shape)
    print('p6=', p6.shape)
    print('p5', p5)
    print('p6', p6)
p4= (5, 5000)
p4[0,0]= -107.035324
p5= (5, 4)
p6= (5, 4)
p5 [[-58.270588  -63.31764   -66.56078   -66.59606  ]
 [-50.70195   -59.564705  -60.10588   -60.713737 ]
 [-10.211766  -13.3529415 -13.843139  -14.133332 ]
 [-24.886272  -35.011753  -36.38429   -36.733334 ]
 [ -8.498037   -9.266665  -11.807843  -12.474509 ]]
p6 [[3015 3148 3455 3798]
 [4024  937 4708 4898]
 [2627 4520 4514 3382]
 [1312 4535 1769 3221]
 [2512 4388 2169 2942]]

4 确定K个图片在类型出现的概率

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
# Look up the one-hot label of every selected neighbour by its index.
f7 = tf.gather(trainLabelInput, f6)

# Sum the one-hot labels over the neighbour axis (axis 1): the result holds,
# per test image, how many of its neighbours belong to each class.
# `axis` is used instead of the deprecated `reduction_indices` alias.
f8 = tf.reduce_sum(f7, axis=1)

# Index of the largest count in each row of f8 = the predicted class.
# `axis` is used instead of the deprecated `dimension` alias.
f9 = tf.argmax(f8, axis=1)

with tf.Session() as sess:
    # Fetch the three tensors in one run; they all share the same feed.
    p7, p8, p9 = sess.run(
        (f7, f8, f9),
        feed_dict={
            trainDataInput: trainData,
            testDataInput: testData[0:testSize],
            trainLabelInput: trainLabel,
        },
    )
    print('p7=', p7.shape)
    print('p7[]', p7)

    print('p8=', p8.shape)
    print('p8[]=', p8)

    print('p9=', p9.shape)
    print('p9[]=', p9)
p7= (5, 4, 10)
p7[] [[[0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
  [0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]]

 [[0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
  [0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]]

 [[0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]]

 [[0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
  [0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]]

 [[0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
  [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]]]
p8= (5, 10)
p8[]= [[0. 0. 0. 0. 4. 0. 0. 0. 0. 0.]
 [0. 0. 4. 0. 0. 0. 0. 0. 0. 0.]
 [0. 4. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 4. 0. 0. 0.]
 [0. 4. 0. 0. 0. 0. 0. 0. 0. 0.]]
p9= (5,)
p9[]= [4 2 1 6 1]

5 检验结果

1
2
3
4
5
6
7
8
9
10
11
12
# Decode the one-hot ground-truth labels into class indices. No TensorFlow
# session is needed here: everything below is plain NumPy / Python.
p10 = np.argmax(testLabel[0:testSize], axis=1)
print('p10[]=', p10)

# A prediction is correct when the predicted class (p9) equals the true
# class (p10) for the same test image.
count = sum(1 for truth, predicted in zip(p10, p9) if truth == predicted)

# Accuracy in percent. The original printed `j*100/5`, but `j` was never
# defined (NameError); the number of matches is `count`.
print('ac=', count * 100 / len(p10))
p10[]= [4 2 1 6 1]
ac= 100.0