Letting an AI Network Teach Itself to Play Gomoku
阿新 • Published: 2018-12-09
I've been learning AI recently and wanted to build something with it, so I wondered: could I make a relatively simple AI that teaches itself to play Gomoku (five-in-a-row)? I didn't want to write any game logic at all; I only tell the AI "you lost this one." In other words, it learns purely from feedback, rewards and penalties, in the spirit of deep reinforcement learning (DQN). The setup is very simple: two identical networks, each scoring positions, playing against each other. The playing strategy follows the trained network part of the time and plays randomly part of the time (to explore for better moves). During training the loss keeps dropping, games last more and more moves, and the stones become more spread out. In the output below, "0" marks the first player's stones and "*" marks the second player's.
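The move selection just described, mostly greedy on the network's evaluation with occasional random moves to keep exploring, is essentially an epsilon-greedy policy. Here is a minimal, self-contained sketch of that idea; the names board, score_move and epsilon are placeholders of mine, not identifiers from the script below:

    import numpy as np

    def pick_move(board, score_move, epsilon=0.1):
        # board: 10x10 array, -1 marks an empty cell
        # score_move(board, i, j): evaluator returning a score for playing at (i, j)
        empties = [(i, j) for i in range(10) for j in range(10) if board[i, j] == -1]
        if np.random.uniform() < epsilon:                       # explore: random empty cell
            return empties[np.random.randint(len(empties))]
        scores = [score_move(board, i, j) for i, j in empties]  # exploit: best-scored cell
        return empties[int(np.argmax(scores))]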
A friend's AI project then got under way, so I haven't followed up on this since. If you're interested, feel free to polish it up, and let's discuss!
import tensorflow as tf
import numpy as np
import os

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

k = 0
M = 10           # board rows
N = 10           # board columns
chessState = {}  # (row, col) -> -1 empty, 1 first player, 0 second player


def getTrainData():
    # Flatten the board dictionary into a 10x10 numpy array.
    tmp = []
    for i in range(M):
        one = []
        for j in range(N):
            one.append(chessState[i, j])
        tmp.append(one)
    return np.reshape(tmp, [-1, 10])


def getOneEpData():
    # Unused helper, left unfinished.
    tmp = []
    for i in range(len(OneRePlay)):
        tmp.append(OneRePlay[i])
    return tmp


def initState():
    # Reset every cell to empty (-1).
    for i in range(M):
        for j in range(N):
            chessState[i, j] = -1
    # chessState[5,5]=1


def outPutChess():
    print("{}".format(getTrainData()).replace("-1", " ").replace("1", "*"))


def outputState():
    for i in range(M):
        print("")
        print("{:2d} {:2d} {:2d} {:2d} {:2d} {:2d} {:2d} {:2d} {:2d} {:2d}".format(
            chessState[i, 0], chessState[i, 1], chessState[i, 2], chessState[i, 3], chessState[i, 4],
            chessState[i, 5], chessState[i, 6], chessState[i, 7], chessState[i, 8], chessState[i, 9]
        ).replace("-1", "-").replace("1", "*"))


def PlayOneStep(L=0., who=-1, sess=tf.Session):
    # Score every empty cell with the given network output L and play the best-scoring one.
    i1, j1 = 0, 0
    p1, p2 = 0.00, 0.000
    for i in range(M):          # try every empty point
        for j in range(N):
            if chessState[i, j] == -1:
                chessState[i, j] = who
                p1 = sess.run(L, feed_dict={x: np.reshape(getTrainData(), [-1, 100])})
                if p1[0][0] > p2:
                    i1, j1 = i, j
                    p2 = p1[0][0]
                chessState[i, j] = -1
    if True:  # np.random.uniform() < 1 - p2:  # random exploration branch, disabled here
        return i1, j1
    else:
        while True:             # randomly try a different spot
            i1 = np.random.randint(0, 9)
            j1 = np.random.randint(0, 9)
            if chessState[i1, j1] == -1:
                break
        return i1, j1


def Normalize(data):
    m = np.mean(data)
    mx = max(data)
    mn = min(data)
    return [(float(i) - m) / (mx - mn) for i in data]


def whoWin(who=1):
    # Scan the board for five in a row belonging to `who`.
    i, j = 0, 0
    h, v, p, l = 0, 0, 0, 0
    for i in range(M):
        for j in range(N):
            if chessState[i, j] == who:
                h, v, p, l = 1, 1, 1, 1
                for m in range(j + 1, N):        # horizontal direction -
                    if chessState[i, m] == who:
                        h += 1
                        if h >= 5:
                            return True, "—"
                    else:
                        h = 0
                for m in range(i + 1, M):        # vertical direction |
                    if chessState[m, j] == who:
                        v += 1
                        if v >= 5:
                            return True, "|"
                    else:
                        v = 0
                for m in range(1, M - j):        # diagonal \ direction
                    if i + m >= M:
                        break
                    if j + m >= M:
                        break
                    if chessState[i + m, j + m] == who:
                        p += 1
                        if p >= 5:
                            return True, "\\"
                    else:
                        p = 0
                for m in range(1, M - i):        # diagonal / direction (1,10)
                    if i + m >= M:
                        break
                    if j - m < 0:
                        break
                    if chessState[i + m, j - m] == who:
                        l += 1
                        if l >= 5:
                            return True, "/"
                    else:
                        l = 0
    return False, ""


initState()
learning_rate = 0.0001

"""
chessState[1,5]=1
chessState[1,6]=1
chessState[1,7]=1
chessState[1,8]=1
chessState[1,9]=1
print("1111")
print(whoWin(1))
print(getTrainData())
exit()
chessState[1,9]=-1
chessState[2,8]=-1
chessState[3,7]=-1
chessState[4,6]=-1
chessState[5,5]=-1
print(whoWin(-1))
exit(0)
"""
# print( getTrainData() )
outPutChess()

# First-player network
x = tf.placeholder(dtype=tf.float32, shape=[None, 100], name="X_In")
y = tf.placeholder(dtype=tf.float32, shape=[None, 1], name="Y_In")
w1 = tf.get_variable("W1", shape=[100, 40], initializer=tf.contrib.layers.xavier_initializer())
b1 = tf.get_variable("b1", shape=[40], initializer=tf.contrib.layers.xavier_initializer())
w2 = tf.get_variable("W2", shape=[40, 1], initializer=tf.contrib.layers.xavier_initializer())
# np.random.uniform(0,1,size=[50,1]))
b2 = tf.get_variable("b2", shape=[1], initializer=tf.contrib.layers.xavier_initializer())
L1 = tf.matmul(x, w1) + b1
L2_R = tf.matmul(L1, w2) + b2
L2 = tf.nn.sigmoid(L2_R)
# loglik = tf.log(y * (y - L2) + (1 - y) * (y + L2))
# los = -tf.reduce_mean(loglik)
# los = -tf.reduce_mean( y * tf.log( L2_R ) )
los = tf.reduce_mean(tf.square(L2_R - y))
# los = tf.nn.softmax_cross_entropy_with_logits_v2(logits=L2, labels=y)
# los = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=L2, labels=y))
train = tf.train.AdamOptimizer(learning_rate).minimize(los)
# train = tf.train.GradientDescentOptimizer(0.0001).minimize(los)

# Second-player network
w1_h = tf.get_variable("w1_h", shape=[100, 40], initializer=tf.contrib.layers.xavier_initializer())
b1_h = tf.get_variable("b1_h", shape=[40], initializer=tf.contrib.layers.xavier_initializer())
w2_h = tf.get_variable("w2_h", shape=[40, 1], initializer=tf.contrib.layers.xavier_initializer())
b2_h = tf.get_variable("b2_h", shape=[1], initializer=tf.contrib.layers.xavier_initializer())
L1_h = tf.matmul(x, w1_h) + b1_h
L2_h_R = tf.matmul(L1_h, w2_h) + b2_h
L2_h = tf.nn.sigmoid(L2_h_R)
# loglik_h = tf.log(y * (y - L2_h) + (1 - y) * (y + L2_h))
# los_h = -tf.reduce_mean(loglik_h)
# los_h = -tf.reduce_mean( y * tf.log( L2_h_R ) )
los_h = tf.reduce_mean(tf.square(L2_h_R - y))
# los_h = tf.nn.softmax_cross_entropy_with_logits_v2(logits=L2_h, labels=y)
# los_h = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=L2_h, labels=y))
# train_h = tf.train.GradientDescentOptimizer(0.001).minimize(los_h)
train_h = tf.train.AdamOptimizer(learning_rate).minimize(los_h)

epCount = 0
AllReplay = []    # replay buffer: board states per episode
AllReward = []    # reward targets for the first-player network
AllRewardH = []   # reward targets for the second-player network
OneRePlay = []
OneReward = []
OneRewardH = []
OneRePlay.append(getTrainData())
OneReward.append(1)
OneRewardH.append(1)
step = 0
who = 1  # player 1 moves first

with tf.Session() as sess:
    init = tf.global_variables_initializer()
    sess.run(init)
    while True:
        oneTran = getTrainData()
        if who == 1:
            i, j = PlayOneStep(L2, who, sess)      # chosen position
        else:
            i, j = PlayOneStep(L2_h, who, sess)    # chosen position
        chessState[i, j] = who
        OneRePlay.append(getTrainData())   # record the board after this move
        OneReward.append(1)                # provisional targets, rescaled when the game ends
        OneRewardH.append(1)
        done, posWin = whoWin(who)
        step += 1
        if done:
            print("------【{:1d}】----------------{:6d}----------【{:s}】----step{:d}--------".format(
                who, epCount, posWin, step).replace("-1", "-"))
            step = 0
            # print( getTrainData() )
            # outputState()
            outPutChess()
            done = True
            iMax = len(OneReward)
            if who == 1:     # first player won
                OneReward[iMax - 1] = 0.96
                for i in reversed(range(iMax - 1)):
                    OneReward[i] *= OneReward[i + 1] * 0.995
                OneRewardH[iMax - 1] = 0.10
                for i in reversed(range(iMax - 1)):
                    OneRewardH[i] *= OneRewardH[i + 1] * 1.02
            else:            # second player won
                OneRewardH[iMax - 1] = 0.96
                for i in reversed(range(iMax - 1)):
                    OneRewardH[i] *= OneRewardH[i + 1] * 0.995
                OneReward[iMax - 1] = 0.10
                for i in reversed(range(iMax - 1)):
                    OneReward[i] *= OneReward[i + 1] * 1.02
            AllReplay.append(OneRePlay)
            AllReward.append(OneReward)
            AllRewardH.append(OneRewardH)
            initState()   # start a new game
            # print(getTrainData())
        if len(AllReplay) > 0:   # and done):  # update gradients
            x_feed = np.vstack(AllReplay)
            x_feed = np.array(x_feed)
            x_feed = np.reshape(x_feed, [-1, 100])
            r = np.hstack(AllReward)
            r = np.array(r)
            rh = np.hstack(AllRewardH)
            rh = np.array(rh)
            _, tlos1, ww1, ww2 = sess.run([train, los, w1, w2],
                                          feed_dict={x: x_feed, y: np.reshape(r, [-1, 1])})
            _, tlos2 = sess.run([train_h, los_h],
                                feed_dict={x: x_feed, y: np.reshape(rh, [-1, 1])})
            if step % 10 == 0:
                print("los1,los2:", tlos1, tlos2)
                # print("w1,w2:", ww1, ww2)
        if done:   # end of an episode
            OneReward = []
            OneRewardH = []
            OneRePlay = []
            OneRePlay.append(getTrainData())
            OneReward.append(1)
            OneRewardH.append(1)
        if who == 1:
            who = 0
        elif who == 0:
            who = 1
        epCount += 1
        if len(AllReplay) > 50:
            AllReplay.pop(0)     # drop the oldest episode, keep the most recent 50
            AllReward.pop(0)
            AllRewardH.pop(0)
        # if epCount % 10 == 0:  # inspect the board
        #     print(getTrainData())
        #     outputState()
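For reference, the episode reward scheme used in the loop above can be written as a standalone function: the winner's moves get a target of 0.96 that decays backwards by a factor of 0.995 per earlier move, while the loser's moves get 0.10 that grows backwards by 1.02. This is my own restatement as a sketch, not part of the original script:

    def episode_targets(n_moves, won, win_val=0.96, lose_val=0.10,
                        win_decay=0.995, lose_growth=1.02):
        # One regression target per recorded state, filled in backwards from the last move.
        r = [1.0] * n_moves
        r[-1] = win_val if won else lose_val
        for i in reversed(range(n_moves - 1)):
            r[i] *= r[i + 1] * (win_decay if won else lose_growth)
        return r

    # Example: episode_targets(4, won=True) ≈ [0.946, 0.950, 0.955, 0.96]

Note that the script targets TensorFlow 1.x (tf.placeholder, tf.contrib, tf.Session); under TensorFlow 2 it would need the tf.compat.v1 API with eager execution disabled and a different Xavier/Glorot initializer.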
The training process looks like this:
[Figure: training output]