【luxuriance19】TF Study Notes Summary - Week 6 (wrapped up for now, to be backfilled from time to time)

Artificial Intelligence
Data Science
Machine Learning

#21

Week 6 practice: the Catch game with Q-learning.

Dependencies

# json is used for loading and saving models
import json
import matplotlib.pyplot as plt
import numpy as np
import time
from PIL import Image
from IPython import display # used to render the frames
import seaborn
from collections import deque # double-ended queue

%matplotlib inline
seaborn.set()

The game screen, the state, and how the state changes after each action, i.e. the definition of the game:

# Define the game dynamics:
class Catch(object):
    """
    The Catch class implements the actual game.
    In the game, white tiles fall from the top of the screen.
    Goal: catch the falling fruit with the basket (also drawn with white tiles).
    Actions: left: 0, stay: 1, right: 2
    """
    
    def __init__(self, grid_size=10):
        self.grid_size = grid_size
        self.reset()
        
    def _update_state(self, action):
        """
        state: f0,f1,basket
            f0,f1: 表示砖块下落的位置
            basktet: 表示basket的位置
        输入: states and actions
        输出: new states and reward
        """
        
        state = self.state
        if action == 0: # 0: left
            action = -1
        elif action == 1: # 1: stay
            action = 0
        else:
            action = 1
            
        f0, f1, basket = state[0]
        # the basket must stay inside the grid; its column is clipped to [1, self.grid_size-1]
        new_basket = min(max(1, basket+action), self.grid_size - 1)
        f0 += 1
        out = np.asarray([f0, f1, new_basket])
        out = out[np.newaxis] # equivalent to out[np.newaxis, :]
        
        assert len(out.shape) == 2
        self.state = out
        
    def _draw_state(self):
        """
        给出游戏界面
        """
        im_size = (self.grid_size,)*2
        state = self.state[0]
        canvas = np.zeros(im_size)
        # draw the falling white tile
        canvas[state[0], state[1]] = 1
        # draw the basket at the bottom of the screen; the basket is 3 white tiles wide
        canvas[-1, state[2]-1: state[2]+2] = 1 
        return canvas
    
    def _get_reward(self):
        """
        回馈reward
        """
        fruit_row, fruit_col, basket = self.state[0]
        if fruit_row == self.grid_size-1:
            if abs(fruit_col - basket) <= 1:
                return 1
            else:
                return -1
        else:
            return 0
        
    def _is_over(self):
        if self.state[0,0] == self.grid_size-1:
            return True
        else:
            return False
    
    def observe(self):
        canvas = self._draw_state()
        return canvas.reshape((1, -1))
    
    def act(self, action):
        self._update_state(action)
        reward = self._get_reward()
        game_over = self._is_over()
        return self.observe(), reward, game_over
    
    def reset(self):
        n = np.random.randint(0, self.grid_size-1)  # np.random.randint(low, high) samples from [low, high)
        m = np.random.randint(1, self.grid_size-2)  # the range is [1, grid_size-2) because the basket is 3 tiles wide
        self.state = np.asarray([0, n, m])[np.newaxis]
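
A quick sanity check of the environment API above (my own snippet, not part of the original notes): play one episode with random actions and print the rewards.

env = Catch(grid_size=10)
env.reset()
game_over = False
while not game_over:
    action = np.random.randint(0, 3)           # 0: left, 1: stay, 2: right
    screen, reward, game_over = env.act(action)
    print(action, reward, screen.shape)        # screen is the flattened 10x10 canvas, shape (1, 100)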

Utilities for visualization:

"""
定义一些variables,用可视化显示
"""
# last_frame_time: 跟踪我们到达哪个frame
last_frame_time = 0
# 将actions转成可读文字
translate_action = ["Left", "Stay", "Right", "Creat Ball", "End Test"]
# 游戏屏幕尺寸(size of the game field)
grid_size = 10

def display_screen(action, points, input_t):
    """
    用于显示game screen(render the game screen)
    """
    
    global last_frame_time
    # print(action)
    print("Action %s, Points: %d"%(translate_action[action],points))
    
    # 只在游戏没有结束的时候显示game screen
    if ("End" not in translate_action[action]):
        plt.imshow(input_t.reshape((grid_size,)*2), interpolation='none', cmap='gray')
        # 删除之前显示的图片,等到接下来的图片可以替代之前的图片的时候
        display.clear_output(wait=True)
        # 显示现在的图片
        display.display(plt.gcf()) #plt.gcf()获得现在图片的查阅
        
    last_frame_time = set_max_fps(last_frame_time,5)

# The earlier version of this function had a small problem; it runs after a minor tweak. The *1000 is presumably for millisecond precision.
# As I understand it, the purpose of this function is to cap rendering at FPS frames per second at most.
# FPS = frames per second
# time.time() returns the seconds since the epoch (1970) as a float
# the millisecond-based implementation below seems a bit more stable (a plain-seconds variant is kept, commented out, at the end)
def set_max_fps(last_frame_time, FPS=1): 
    current_milli_time = lambda: int(round(time.time()*1000))
    sleep_time = 1.0/FPS - (current_milli_time() - last_frame_time)/1000.
    # print(sleep_time)
    if sleep_time > 0:
        time.sleep(sleep_time)
    return current_milli_time()
    '''
    current_milli_time = time.time()
    sleep_time = 1.0/FPS - (current_milli_time - last_frame_time)
    print(sleep_time)
    if sleep_time > 0:
        time.sleep(sleep_time)
    return current_milli_time
    '''

The main learning loop of the game. It implements experience replay: during Q-learning, (input, target) pairs are sampled from past experience; as it samples, the model keeps training and the experience memory keeps being updated.

"""
NN的输入:<s,a>
NN的输出:Q(s,a)

training process:
experineces: <s,a,r,s'>(s:current state, a:current action, r: instant reward, s': following state)
1. 对每个action计算Q(s',a')(Q值,也就是与state和action都有关的value function)
2. 选取这三个不同action的最大的Q值
3. 计算带有discount factor(衰减因子):gamma的总的Q值:Q(s,a) = r + gamma*max(Q(s',a'))。这个就是神经网络的目标值
4. 利用loss function: 1//2*(predict_Q(s,a)-target)^2

所有的experience放置在replay memory当中
"""
class ExperienceReplay(): # in Python 3 classes inherit from object automatically, so I mostly omit it from here on
    """
    Every experience <s, a, r, s'> collected during gameplay is stored in the replay memory.
    During training, batches of experiences are sampled at random from the replay memory and used as inputs and targets.
    """
    
    def __init__(self, max_memory=100, discount=.9):
        """
        max_memory: the maximum number of experiences that can be stored
        memory: a deque of [experience, game_over (bool)] entries; each experience has four elements
        experience: [initial game screen, action, reward, following game screen]
        discount: discount factor
        """
        #============================================================================
        # self.max_memory = max_memory
        # self.memory = []
        #============================================================================
        self.memory = deque(maxlen=max_memory)
        self.discount = discount
        
    def remember(self, states, game_over):
        # store the experience in memory
        self.memory.append([states, game_over])
        
        #==============================================================================
        # a plain list grows without bound, so the oldest element had to be deleted manually;
        # why not use collections.deque here? I looked around briefly but did not find a
        # detailed comparison of list vs. deque implementations or their complexities.
        # if len(self.memory) > self.max_memory:
        #     del self.memory[0]
        #==============================================================================
        
    def get_batch(self, model, batch_size=10):
        """
        len_memory: 存储了多少experience
        num_actions: 计算在game中有多少中actions可以被采取
        env_dim: game field的维度
        inputs:batches of inputs([cur_state,action,reward, follwed_state])
        targets: batches of 目标函数
        """
        len_memory = len(self.memory)
        num_actions = model.output_shape[-1]
        # print(self.memory)
        # memory的构成是list of experience,在Catch.obeserve()返回当前图片的时候已经将其转换成一维的
        # 向量,所以这里实际上就是grid_size**2
        env_dim = self.memory[0][0][0].shape[1]
        # print(env_dim)
        inputs = np.zeros((min(len_memory, batch_size), env_dim))
        targets = np.zeros((inputs.shape[0], num_actions))
        
        for i, idx in enumerate(np.random.randint(0, len_memory, size=inputs.shape[0])):
            """
            state_t: initial state s
            action_t: action taken a
            reward_t: reward earned r
            state_tpl: the state that followed s'
            """
            state_t, action_t, reward_t, state_tpl = self.memory[idx][0]
            game_over = self.memory[idx][1]
            # inputs[i:i+1] = state_t
            inputs[i] = state_t
            # print(model.predict(state_t).shape)
            targets[i] = model.predict(state_t)[0] # model.predict(state_t) has shape [1, num_actions]
            
            Q_sa = np.max(model.predict(state_tpl)[0])
            
            if game_over:
                targets[i, action_t] = reward_t
            else:
                targets[i, action_t] = reward_t + self.discount * Q_sa
                
        return inputs, targets      

Build a simple three-layer dense model for Q-learning:

"""
Q-learning: 三层的dense network。
"""
from keras.models import model_from_json
from keras.models import Sequential
from keras.layers.core import Dense
from keras.optimizers import sgd

def baseline_model(grid_size, num_actions, hidden_size):
    model = Sequential()
    model.add(Dense(hidden_size, input_shape=(grid_size**2,), activation="relu"))
    model.add(Dense(hidden_size, activation="relu"))
    model.add(Dense(num_actions))
    model.compile(sgd(lr=.1), "mse") # the loss is mean squared error
    return model

"""
模型参数
"""
num_actions = 3
hidden_size = 100
grid_size = 10

model = baseline_model(grid_size, num_actions, hidden_size)
model.summary()

The Deep Q-Network algorithm:

Initialize replay memory D to size N
Initialize action-value function Q with random weights
for episode = 1, M do
    Initialize state s_1
    for t = 1, T do
        With probability ε select a random action a_t
        otherwise select a_t = argmax_a Q(s_t, a; θ_i)
        Execute action a_t in the emulator and observe r_t and s_(t+1)
        Store transition (s_t, a_t, r_t, s_(t+1)) in D
        Sample a minibatch of transitions (s_j, a_j, r_j, s_(j+1)) from D
        Set y_j :=
            r_j                                    for terminal s_(j+1)
            r_j + γ * max_a' Q(s_(j+1), a'; θ_i)   for non-terminal s_(j+1)
        Perform a gradient step on (y_j - Q(s_j, a_j; θ_i))^2 with respect to θ
    end for
end for
"""
Q-learning的全部的过程,(这里实际上是训练一个策略policy,也就是policy gradient)
DQN(输入是state,输出是每个action的value值,这里输出的这个value值是对应的在这个状态采取这个动作之后总共获得的value的值)
DQN-输入:state,输出:在这个state能够采取不同的action能够获得的最大的value值

这里类似与利用model作为一个simulator(输入是state,输出是action,也就是对DQN的输出取value最大的action),
在simulator的过程当中为了防止只依赖过去的行为而导致之后的行为和之前的行为高度相关,除了利用这个model来做动作的预测之外,
还添加了一些西奥的抖动,也就是随机选取一些动作的过程。这些模拟的动作的过程都存储到experience replay当中(memory当中)

在训练的时候,每次从experience_replay当中取值,目标值就是最大的从当前这个状态开始到游戏结束所获得的总的value值。
模型训练的目的:然个model的输出逼近在这个state能够获得的最大的所有的reward的值。

loss:利用的是MSE
"""
def train(model, epochs, verbose=1):
    """
    训练参数
    """
    epsilon = .1
    max_memory = 500
    batch_size = 1
    
    env = Catch(grid_size)
    exp_replay = ExperienceReplay(max_memory=max_memory)
    
    win_cnt = 0
    win_hist = []
    for e in range(epochs):
        loss = 0.
        env.reset()
        game_over = False
        input_t = env.observe() # the input is the current game screen
        
        while not game_over:
            input_tml = input_t
            
            # epsilon-greedy exploration: add a little randomness so the agent does not get stuck in a local minimum
            # by repeating behaviour highly correlated with the past and never discovering better actions
            # (isn't the experience replay / replay memory added for a similar, decorrelating purpose?)
            if np.random.rand() <= epsilon:
                action = np.random.randint(0, num_actions, size=1)[0]
            else:
                q = model.predict(input_tml)
                action = np.argmax(q[0]) # when there are ties, np.argmax() returns the index of the first maximum
                
            # print(action)
                
            input_t, reward, game_over = env.act(action)
            if reward == 1:
                win_cnt += 1
                
            
            # uncomment to visualize the training process
            # display_screen(action, 3000, input_t)
            
            exp_replay.remember([input_tml, action, reward, input_t], game_over)
            
            inputs, targets = exp_replay.get_batch(model, batch_size=batch_size)
            
            batch_loss = model.train_on_batch(inputs, targets)
            
            loss += batch_loss
            
        if verbose > 0:
            print("Epoch {:03d}/{:03d} | Loss {:.4f} | Win count {}".format(e,epochs, loss, win_cnt))
        win_hist.append(win_cnt)
        
    return win_hist   

The training run plus a test function

# Training
# play many games
epoch = 5000
hist = train(model, epoch, verbose=0)
print("Training done")

model.save_weights("model.h5", overwrite=True)
with open("model.json","w") as outfile:
    json.dump(model.to_json(), outfile)
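
For completeness, a minimal sketch (my own, not part of the original notes) of loading the saved model back in; it assumes the "model.json" and "model.h5" files written above and the same Keras version.

with open("model.json") as infile:
    loaded_model = model_from_json(json.load(infile))  # json.load returns the JSON string written by json.dump
loaded_model.load_weights("model.h5")
loaded_model.compile(sgd(lr=.1), "mse")                # recompile before further training or evaluation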

# Testing
def test(model):
    grid_size = 10
    global last_frame_time
    # plt.ion() # turn on interactive mode (the console defaults to interactive mode, scripts default to blocking mode)
    env = Catch(grid_size)
    c = 0
    last_frame_time = 0
    points = 0
    
    for _ in range(10):
        loss = 0.
        env.reset()
        game_over = False
        input_t = env.observe()
        display_screen(3,points, input_t)
        plt.imshow(input_t.reshape((grid_size,)*2),interpolation='none', cmap='gray')
        plt.savefig("%03d.png"%c)
        c += 1
        while not game_over:
            input_tml = input_t
            q = model.predict(input_tml)
            action = np.argmax(q[0])
            input_t, reward, game_over = env.act(action)
            points += reward
            display_screen(action, points, input_t)
            plt.imshow(input_t.reshape((grid_size,)*2), interpolation='none', cmap='gray')
            plt.savefig("%03d.png"%c)
            c += 1
    display_screen(4,points,input_t)
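
Running the test is then simply the call below; the frames are written out as 000.png, 001.png, …, which the next post turns into a GIF.

test(model)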

#22

This part covers assembling the saved PNG frames into a GIF and splitting a GIF back into frames.

"""
gif图合成
"""
import imageio
def create_gif(image_list, gif_name):
    frames = []
    for image_name in image_list:
        frames.append(imageio.imread(image_name))
    # Save them as frames into gif
    imageio.mimsave(gif_name, frames, 'GIF', duration=0.1) # duration: seconds per frame
    
    return

def main():
    image_list = ["%03d.png"%i for i in range(100)]
    gif_name = "catch_game.gif"
    create_gif(image_list, gif_name)
    
if __name__ == "__main__":
    main()
"""
从gif图中截取图片
"""
import os
from PIL import Image
# im.tile = [(decoder, (0,0)+im.size, offset, extra)]
def analyseImage(path):
    im = Image.open(path)
    results = {'size':im.size, 'mode':"full"}
    try:
        while True:
            if im.tile: # im.tile is a list of tile descriptors
                # print(im.tile) # [('gif', (0, 0, 432, 288), 819, (8, False))]
                tile = im.tile[0]
                update_region = tile[1]
                update_region_dimensions = update_region[2:]
                if update_region_dimensions != im.size:
                    results["mode"] = "partial"
                    break
            im.seek(im.tell()+1)
    except EOFError:
        pass
    return results

def processImage(path):
    """
    迭代GIF,获得每帧
    """
    mode = analyseImage(path)["mode"]
    
    im = Image.open(path)
    
    i = 0
    p = im.getpalette() # returns the image palette as a list of color values
    last_frame = im.convert('RGBA')
    
    try:
        while True:
            print("saving %s (%s) frame %d, %s %s" % (path, mode, i, im.size, im.tile) )
            '''
            If the GIF uses a local color table, each frame has its own palette; otherwise we need to apply the global palette to the new frame.
            '''
            if not im.getpalette():
                im.putpalette(p)
                
            new_frame = Image.new("RGBA",im.size)
            """
            如果说是"partial"mode, 这个时候每帧update的region不是原来的图片大小,所以需要构建一个在原来的图片上面的新的frame
            """
            if mode == "partial":
                new_frame.paste(last_frame)
                
            new_frame.paste(im, (0,0), im.convert("RGBA")) # Image.paste(im, box=None, mask=None): box is a 4-tuple region or a 2-tuple upper-left corner; if an image is given as the second argument with no third argument, box defaults to (0,0)
            new_frame.save('Datageek/%s-%d.png' % (''.join(os.path.basename(path).split('.')[:-1]), i), 'PNG') # strip the trailing .gif from the filename
            
            i += 1
            last_frame = new_frame
            # GIF images support the seek() and tell() methods; the next frame is reached via im.seek(im.tell()+1),
            # and you can rewind to the first frame, but random frame access is not supported.
            # im.seek() raises an EOFError when seeking past the last frame.
            im.seek(im.tell()+1)
            
    except EOFError:
        pass
    
def main():  
    processImage('catch_game.gif')  
    
if __name__ == "__main__":
    main()

#23

Flappy Bird practice:

All image coordinates here refer to the pixel at the image's top-left corner; the axes extend to the right and downward from that corner.
flappy_bird_utils.py:
This file loads the game's images with pygame.image, producing Surface objects.

import pygame
import sys
def load():
    """
    存储类型{key:value(tuple)}
    IMAGES: 所有的游戏的背景的图片
    SOUNDS: 游戏的背景声音
    HITMASKS: 如果说=0,说名没有撞到管道,每一个图片是用嵌套list存储的
    """
    # path of player with different states
    # 这里载入了小鸟飞行时候的三种不同的状态,一个图片代表一个飞行的状态
    PLAYER_PATH = (
            'assets/sprites/redbird-upflap.png', # 向上飞行
            'assets/sprites/redbird-midflap.png', # 水平飞行
            'assets/sprites/redbird-downflap.png' # 向下飞行
    )

    # path of background
    # the background is black
    BACKGROUND_PATH = 'assets/sprites/background-black.png'

    # path of pipe
    # the pipes the bird has to pass
    PIPE_PATH = 'assets/sprites/pipe-green.png'

    IMAGES, SOUNDS, HITMASKS = {}, {}, {}

    # numbers sprites for score display
    # digit images for the score display, loaded with pygame.image.load()
    # pygame.image.load() accepts a filename or a Python file-like object; Pygame determines the image type (e.g. GIF or bitmap)
    # automatically and creates a new Surface object from the data.
    # If a file-like object is passed, the namehint argument, load(fileobj, namehint=""), is needed to tell Pygame the original
    # format, i.e. the file extension.
    # The Surface returned by load() keeps the same color format, colorkey and alpha transparency as the file. Calling
    # Surface.convert() (no arguments needed) creates a copy that blits to the screen faster.
    # For alpha transparency (per-pixel transparency, as in .png images), use convert_alpha() so the image keeps per-pixel alpha.
    IMAGES['numbers'] = (
        pygame.image.load('assets/sprites/0.png').convert_alpha(),
        pygame.image.load('assets/sprites/1.png').convert_alpha(),
        pygame.image.load('assets/sprites/2.png').convert_alpha(),
        pygame.image.load('assets/sprites/3.png').convert_alpha(),
        pygame.image.load('assets/sprites/4.png').convert_alpha(),
        pygame.image.load('assets/sprites/5.png').convert_alpha(),
        pygame.image.load('assets/sprites/6.png').convert_alpha(),
        pygame.image.load('assets/sprites/7.png').convert_alpha(),
        pygame.image.load('assets/sprites/8.png').convert_alpha(),
        pygame.image.load('assets/sprites/9.png').convert_alpha()
    )

    # base (ground) sprite
    IMAGES['base'] = pygame.image.load('assets/sprites/base.png').convert_alpha()

    # sounds
    # check which platform we are running on
    if 'win' in sys.platform:
        soundExt = '.wav'
    else:
        soundExt = '.ogg'

    # pygame.mixer.Sound() creates a new Sound object from a file or a buffer object.
    # The keyword forms are pygame.mixer.Sound(file=filename) or pygame.mixer.Sound(buffer=buffer); the docs recommend the
    # keywords to avoid ambiguity. Only OGG and (uncompressed) WAV files are supported.
    SOUNDS['die']    = pygame.mixer.Sound('assets/audio/die' + soundExt)
    SOUNDS['hit']    = pygame.mixer.Sound('assets/audio/hit' + soundExt)
    SOUNDS['point']  = pygame.mixer.Sound('assets/audio/point' + soundExt)
    SOUNDS['swoosh'] = pygame.mixer.Sound('assets/audio/swoosh' + soundExt)
    SOUNDS['wing']   = pygame.mixer.Sound('assets/audio/wing' + soundExt)

    # select random background sprites
    IMAGES['background'] = pygame.image.load(BACKGROUND_PATH).convert()

    # select random player sprites
    IMAGES['player'] = (
        pygame.image.load(PLAYER_PATH[0]).convert_alpha(),
        pygame.image.load(PLAYER_PATH[1]).convert_alpha(),
        pygame.image.load(PLAYER_PATH[2]).convert_alpha(),
    )

    # select random pipe sprites
    # pygame.transform.rotate(Surface, angle)->Surface
    # the upper and lower pipes
    IMAGES['pipe'] = (
        pygame.transform.rotate(
            pygame.image.load(PIPE_PATH).convert_alpha(), 180),
        pygame.image.load(PIPE_PATH).convert_alpha(),
    )

    # hitmask for pipes
    HITMASKS['pipe'] = (
        getHitmask(IMAGES['pipe'][0]),
        getHitmask(IMAGES['pipe'][1]),
    )

    # hitmask for player
    HITMASKS['player'] = (
        getHitmask(IMAGES['player'][0]),
        getHitmask(IMAGES['player'][1]),
        getHitmask(IMAGES['player'][2]),
    )

    return IMAGES, SOUNDS, HITMASKS

def getHitmask(image):
    """returns a hitmask using an image's alpha."""
    # image.get_width() returns the image width in pixels
    # image.get_at((x,y)) returns the color of a single pixel as (red, green, blue, alpha); if the surface has no per-pixel alpha, alpha is always 255
    # the mask records, for every pixel, whether its alpha value is nonzero
    mask = []
    for x in range(image.get_width()):
        mask.append([])
        for y in range(image.get_height()):
            mask[x].append(bool(image.get_at((x,y))[3]))
    return mask

wrapped_flappy_bird.py
This file defines the Flappy Bird game dynamics: how the screen state changes given an action. The drawing is plain pygame, updating the image with Surface.blit(). I un-commented the sound playback here, but the audio seems to lag a little and is not perfectly in sync.

import numpy as np
import sys
import random
import pygame
import flappy_bird_utils
import pygame.surfarray as surfarray
from pygame.locals import *
from itertools import cycle

FPS = 30
SCREENWIDTH  = 288
SCREENHEIGHT = 512

pygame.init() # initialize all imported pygame modules; returns (numpass, numfail) and does not raise, whereas initializing individual modules can raise
FPSCLOCK = pygame.time.Clock() # create a Clock object to track elapsed time; it also provides methods that help control the game's frame rate
SCREEN = pygame.display.set_mode((SCREENWIDTH, SCREENHEIGHT)) # create the display window; the signature is set_mode(resolution=(0,0), flags=0, depth=0), where depth is the bits per color; returns a Surface object
pygame.display.set_caption('Flappy Bird') # set the current window caption; arguments are title, icontitle=None

IMAGES, SOUNDS, HITMASKS = flappy_bird_utils.load()
PIPEGAPSIZE = 100 # gap between upper and lower part of pipe
BASEY = SCREENHEIGHT * 0.79

PLAYER_WIDTH = IMAGES['player'][0].get_width()
PLAYER_HEIGHT = IMAGES['player'][0].get_height()
PIPE_WIDTH = IMAGES['pipe'][0].get_width()
PIPE_HEIGHT = IMAGES['pipe'][0].get_height()
BACKGROUND_WIDTH = IMAGES['background'].get_width()

PLAYER_INDEX_GEN = cycle([0, 1, 2, 1]) # cycle of wing positions (up, mid, down, mid) so the bird looks animated


class GameState:
    '''
    Every object's position is given by the coordinates of its top-left pixel; the top-left pixel is the reference point.
    '''
    def __init__(self):
        '''
        self.playerx, self.playery: the player's position; playery is the top edge of the player sprite.

        '''
        self.score = self.playerIndex = self.loopIter = 0
        self.playerx = int(SCREENWIDTH * 0.2)
        self.playery = int((SCREENHEIGHT - PLAYER_HEIGHT) / 2)
        self.basex = 0
        self.baseShift = IMAGES['base'].get_width() - BACKGROUND_WIDTH #336-288=48

        # for upperPipes, 'x' is the left edge of the pipe and 'y' is its top edge
        newPipe1 = getRandomPipe() # returns the positions of an upper/lower pipe pair
        newPipe2 = getRandomPipe()
        self.upperPipes = [
            {'x': SCREENWIDTH, 'y': newPipe1[0]['y']},
            {'x': SCREENWIDTH + (SCREENWIDTH / 2), 'y': newPipe2[0]['y']},
        ]
        self.lowerPipes = [
            {'x': SCREENWIDTH, 'y': newPipe1[1]['y']},
            {'x': SCREENWIDTH + (SCREENWIDTH / 2), 'y': newPipe2[1]['y']},
        ]

        # player velocity, max velocity, downward acceleration, acceleration on flap
        # pixel (0,0) is the top-left corner, so flying up means a negative y velocity and falling down a positive one
        self.pipeVelX = -4         # pipeVelX: the bird "flies to the right", so the pipes move to the left
        self.playerVelY    =  0    # player's velocity along Y, default same as playerFlapped
        self.playerMaxVelY =  10   # max vel along Y, max descend speed
        self.playerMinVelY =  -8   # min vel along Y, max ascend speed
        self.playerAccY    =   1   # players downward accleration
        self.playerFlapAcc =  -9   # players speed on flapping
        self.playerFlapped = False # True when player flaps

    def frame_step(self, input_actions):
        '''
        Advance the game by one frame given the input action and return the new screen state.
        '''
        pygame.event.pump() # run pygame's internal event handlers; if no other event functions are used to interact with the rest of the OS, pygame.event.pump() should be called so pygame can take care of internal actions

        reward = 0.1
        terminal = False

        if sum(input_actions) != 1:
            raise ValueError('Multiple input actions!')

        # input_actions[0] == 1: do nothing
        # input_actions[1] == 1: flap the bird
        # when the flap action is taken, first check the bird's position; if it is within range, give the bird an upward y velocity
        if input_actions[1] == 1:
            if self.playery > -2 * PLAYER_HEIGHT:
                self.playerVelY = self.playerFlapAcc
                self.playerFlapped = True
                SOUNDS['wing'].play() # play the flap sound

        # check for score
        # playerMidPos: the horizontal center of the player
        # pipeMidPos: the horizontal center of the pipe
        # when the bird's center has passed the pipe's center by less than 4 pixels (one frame of pipe movement), score += 1 and the reward is 1
        playerMidPos = self.playerx + PLAYER_WIDTH / 2
        for pipe in self.upperPipes:
            pipeMidPos = pipe['x'] + PIPE_WIDTH / 2
            if pipeMidPos <= playerMidPos < pipeMidPos + 4:
                self.score += 1
                SOUNDS['point'].play()
                reward = 1

        # playerIndex basex change
        # loopIter cycles through [0, 29]; every time it advances by 3, playerIndex changes
        # basex decreases by 100 each step modulo baseShift, staying non-positive in (-baseShift, 0]
        # this gives the bird an animated look: its wing position changes every three frames rather than every frame
        if (self.loopIter + 1) % 3 == 0:
            self.playerIndex = next(PLAYER_INDEX_GEN)
        self.loopIter = (self.loopIter + 1) % 30
        self.basex = -((-self.basex + 100) % self.baseShift)

        # player's movement
        # if the player's downward velocity has not reached the maximum and the bird did not flap, add the downward acceleration; afterwards reset playerFlapped to False
        # this part updates the player's y position by playerVelY, but never past BASEY (presumably the top of the ground strip), which is why BASEY rather than SCREENHEIGHT is the lower bound
        # when flying upward, the bird cannot go above pixel row 0
        if self.playerVelY < self.playerMaxVelY and not self.playerFlapped:
            self.playerVelY += self.playerAccY
        if self.playerFlapped:
            self.playerFlapped = False
        self.playery += min(self.playerVelY, BASEY - self.playery - PLAYER_HEIGHT)
        if self.playery < 0:
            self.playery = 0

        # move pipes to left
        for uPipe, lPipe in zip(self.upperPipes, self.lowerPipes):
            uPipe['x'] += self.pipeVelX
            lPipe['x'] += self.pipeVelX

        # add new pipe when first pipe is about to touch left of screen
        if 0 < self.upperPipes[0]['x'] < 5:
            newPipe = getRandomPipe()
            self.upperPipes.append(newPipe[0])
            self.lowerPipes.append(newPipe[1])

        # remove first pipe if its out of the screen
        # once a pipe has moved off the left edge of the screen, pop it from the pipe lists
        if self.upperPipes[0]['x'] < -PIPE_WIDTH:
            self.upperPipes.pop(0)
            self.lowerPipes.pop(0)

        # check if crash here
        isCrash= checkCrash({'x': self.playerx, 'y': self.playery,
                             'index': self.playerIndex},
                            self.upperPipes, self.lowerPipes)
        if isCrash:
            SOUNDS['hit'].play()
            SOUNDS['die'].play()
            terminal = True
            self.__init__()
            reward = -1

        # draw sprites
        # pygame.Surface.blit(source, dest, area=None, special_flags=0) draws one surface (source) onto the surface it is called on; dest can be a coordinate pair giving the top-left corner where source is placed, or a Rect
        SCREEN.blit(IMAGES['background'], (0,0))

        for uPipe, lPipe in zip(self.upperPipes, self.lowerPipes):
            SCREEN.blit(IMAGES['pipe'][0], (uPipe['x'], uPipe['y']))
            SCREEN.blit(IMAGES['pipe'][1], (lPipe['x'], lPipe['y']))

        SCREEN.blit(IMAGES['base'], (self.basex, BASEY))
        # print score so player overlaps the score
        showScore(self.score)
        SCREEN.blit(IMAGES['player'][self.playerIndex],
                    (self.playerx, self.playery))

        # pygame.surfarray.array3d() copies the pixels from a surface into a 3-D array
        image_data = pygame.surfarray.array3d(pygame.display.get_surface())
        # pygame.display.update() redraws part of the screen (update(rectangle=...)); with no argument it updates the whole surface, like pygame.display.flip()
        pygame.display.update()
        # pygame.time.Clock.tick() must be called once per frame; it measures how many milliseconds have passed since the previous call, and with an argument, tick(FPS), it also caps the maximum frame rate
        FPSCLOCK.tick(FPS)
        #print self.upperPipes[0]['y'] + PIPE_HEIGHT - int(BASEY * 0.2)
        return image_data, reward, terminal

def getRandomPipe():
    """returns a randomly generated pipe"""
    # the vertical gap between the upper and lower pipe is fixed at PIPEGAPSIZE
    # a pipe's x and y are the coordinates of its top-left pixel
    # y of gap between upper and lower pipe
    gapYs = [20, 30, 40, 50, 60, 70, 80, 90]
    index = random.randint(0, len(gapYs)-1)
    gapY = gapYs[index]

    gapY += int(BASEY * 0.2)
    pipeX = SCREENWIDTH + 10

    return [
        {'x': pipeX, 'y': gapY - PIPE_HEIGHT},  # upper pipe
        {'x': pipeX, 'y': gapY + PIPEGAPSIZE},  # lower pipe
    ]


def showScore(score):
    """displays score in center of screen"""
    scoreDigits = [int(x) for x in list(str(score))]
    totalWidth = 0 # total width of all numbers to be printed

    for digit in scoreDigits:
        totalWidth += IMAGES['numbers'][digit].get_width()

    Xoffset = (SCREENWIDTH - totalWidth) / 2

    # draw the score digits centered at the top of the screen
    for digit in scoreDigits:
        SCREEN.blit(IMAGES['numbers'][digit], (Xoffset, SCREENHEIGHT * 0.1))
        Xoffset += IMAGES['numbers'][digit].get_width()


def checkCrash(player, upperPipes, lowerPipes):
    """returns True if player collders with base or pipes."""
    pi = player['index']
    player['w'] = IMAGES['player'][0].get_width()
    player['h'] = IMAGES['player'][0].get_height()

    # if player crashes into ground
    if player['y'] + player['h'] >= BASEY - 1:
        return True
    else:
        # pygame.Rect(left, top, width, height) returns a Rect object
        playerRect = pygame.Rect(player['x'], player['y'],
                      player['w'], player['h'])

        for uPipe, lPipe in zip(upperPipes, lowerPipes):
            # upper and lower pipe rects
            uPipeRect = pygame.Rect(uPipe['x'], uPipe['y'], PIPE_WIDTH, PIPE_HEIGHT)
            lPipeRect = pygame.Rect(lPipe['x'], lPipe['y'], PIPE_WIDTH, PIPE_HEIGHT)

            # player and upper/lower pipe hitmasks
            pHitMask = HITMASKS['player'][pi]
            uHitmask = HITMASKS['pipe'][0]
            lHitmask = HITMASKS['pipe'][1]

            # if bird collided with upipe or lpipe
            uCollide = pixelCollision(playerRect, uPipeRect, pHitMask, uHitmask)
            lCollide = pixelCollision(playerRect, lPipeRect, pHitMask, lHitmask)

            if uCollide or lCollide:
                return True

    return False

def pixelCollision(rect1, rect2, hitmask1, hitmask2):
    """Checks if two objects collide and not just their rects"""
    # rect1.clip(rect2) returns a new Rect: the part of rect1 that lies inside rect2; if the two rects do not overlap, a zero-sized Rect is returned
    rect = rect1.clip(rect2)

    if rect.width == 0 or rect.height == 0:
        return False

    x1, y1 = rect.x - rect1.x, rect.y - rect1.y
    x2, y2 = rect.x - rect2.x, rect.y - rect2.y

    for x in range(rect.width):
        for y in range(rect.height):
            if hitmask1[x1+x][y1+y] and hitmask2[x2+x][y2+y]:
                return True
    return False
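
A tiny illustration (my own) of the Rect.clip() behaviour that pixelCollision relies on:

import pygame
r1 = pygame.Rect(0, 0, 10, 10)
r2 = pygame.Rect(6, 6, 10, 10)
print(r1.clip(r2))                              # <rect(6, 6, 4, 4)>: the region shared by r1 and r2
print(r1.clip(pygame.Rect(50, 50, 5, 5)).size)  # (0, 0): non-overlapping rects give a zero-sized Rect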

deep_q_network.py
The heavyweight part of the program: the DQN training code. Rather than treating each game episode as one training run, it trains continuously.
The DQN uses three convolutional layers, but only the first is followed by max pooling; the others are plain convolutions, and all activations are ReLU. Two fully connected layers follow. Both the convolutions and the max pooling use "SAME" padding.
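
As a quick sanity check of the layer sizes annotated in the code below (my own calculation; with "SAME" padding the spatial output size is ceil(input / stride)):

import math

size = 80                   # input frames are 80x80x4
size = math.ceil(size / 4)  # conv1: 8x8 kernel, stride 4 -> 20x20x32
size = math.ceil(size / 2)  # max pool 2x2, stride 2      -> 10x10x32
size = math.ceil(size / 2)  # conv2: 4x4 kernel, stride 2 -> 5x5x64
size = math.ceil(size / 1)  # conv3: 3x3 kernel, stride 1 -> 5x5x64
print(size * size * 64)     # 1600, matching the input size of the first fully connected layer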

#!/usr/bin/env python
from __future__ import print_function

import tensorflow as tf
import cv2
import sys
sys.path.append("game/")
import wrapped_flappy_bird as game
import random
import numpy as np
from collections import deque

GAME = 'bird' # the name of the game being played for log files
ACTIONS = 2 # number of valid actions
GAMMA = 0.99 # decay rate of past observations
OBSERVE = 10000. # timesteps to observe before training
EXPLORE = 3000000. # frames over which to anneal epsilon
FINAL_EPSILON = 0.0001 # final value of epsilon
INITIAL_EPSILON = 0.1 # starting value of epsilon
REPLAY_MEMORY = 50000 # number of previous transitions to remember
BATCH = 32 # size of minibatch
FRAME_PER_ACTION = 1

def weight_variable(shape):
    initial = tf.truncated_normal(shape, stddev = 0.01)
    return tf.Variable(initial)

def bias_variable(shape):
    initial = tf.constant(0.01, shape = shape)
    return tf.Variable(initial)

def conv2d(x, W, stride):
    return tf.nn.conv2d(x, W, strides = [1, stride, stride, 1], padding = "SAME") # with SAME padding the output width is ceil(width / stride)

def max_pool_2x2(x):
    return tf.nn.max_pool(x, ksize = [1, 2, 2, 1], strides = [1, 2, 2, 1], padding = "SAME")

def createNetwork():
    '''
    Network structure: three convolutional layers followed by two fully connected layers.
    '''
    # network weights
    W_conv1 = weight_variable([8, 8, 4, 32])
    b_conv1 = bias_variable([32])

    W_conv2 = weight_variable([4, 4, 32, 64])
    b_conv2 = bias_variable([64])

    W_conv3 = weight_variable([3, 3, 64, 64])
    b_conv3 = bias_variable([64])

    W_fc1 = weight_variable([1600, 512])
    b_fc1 = bias_variable([512])

    W_fc2 = weight_variable([512, ACTIONS])
    b_fc2 = bias_variable([ACTIONS])

    # input layer
    s = tf.placeholder("float", [None, 80, 80, 4])

    # hidden layers
    h_conv1 = tf.nn.relu(conv2d(s, W_conv1, 4) + b_conv1) # outshape:20×20
    h_pool1 = max_pool_2x2(h_conv1)

    h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2, 2) + b_conv2) # outshape: 5×5
    #h_pool2 = max_pool_2x2(h_conv2)

    h_conv3 = tf.nn.relu(conv2d(h_conv2, W_conv3, 1) + b_conv3)
    #h_pool3 = max_pool_2x2(h_conv3)

    #h_pool3_flat = tf.reshape(h_pool3, [-1, 256])
    h_conv3_flat = tf.reshape(h_conv3, [-1, 1600])

    h_fc1 = tf.nn.relu(tf.matmul(h_conv3_flat, W_fc1) + b_fc1)

    # readout layer
    readout = tf.matmul(h_fc1, W_fc2) + b_fc2

    return s, readout, h_fc1

def trainNetwork(s, readout, h_fc1, sess):
    '''
    a: the action taken in the current state; there are two actions, [1,0] = do nothing, [0,1] = flap upward
    y: the target Q-value for the current state
    s: the network input representing the current state (the DQN input)

    DQN objective: the output for the chosen action should be as close to y as possible
    '''
    # define the cost function
    a = tf.placeholder("float", [None, ACTIONS])
    y = tf.placeholder("float", [None])
    readout_action = tf.reduce_sum(tf.multiply(readout, a), reduction_indices=1) # in tf.reduce_sum(), the axis and reduction_indices arguments are equivalent
    cost = tf.reduce_mean(tf.square(y - readout_action)) # the loss here is again MSE
    train_step = tf.train.AdamOptimizer(1e-6).minimize(cost)

    # open up a game state to communicate with emulator
    game_state = game.GameState()

    # store the previous observations in replay memory
    D = deque()

    # printing
    a_file = open("logs_" + GAME + "/readout.txt", 'w')
    h_file = open("logs_" + GAME + "/hidden.txt", 'w')

    # get the first state by doing nothing and preprocess the image to 80x80x4
    do_nothing = np.zeros(ACTIONS)
    do_nothing[0] = 1
    x_t, r_0, terminal = game_state.frame_step(do_nothing)
    # convert the frame's color and size
    # cv2.resize(image, (width, height))
    # cv2.cvtColor(image, color_type)
    # cv2.threshold(src, thresh (0~255), maxval (0~255), type); with cv2.THRESH_BINARY, pixels above the threshold of 1 become 255 and the rest become 0
    x_t = cv2.cvtColor(cv2.resize(x_t, (80, 80)), cv2.COLOR_BGR2GRAY)
    ret, x_t = cv2.threshold(x_t,1,255,cv2.THRESH_BINARY)
    s_t = np.stack((x_t, x_t, x_t, x_t), axis=2) # stack the same frame four times along a new axis as four channels, so s_t has shape (80, 80, 4)

    # saving and loading networks
    saver = tf.train.Saver(max_to_keep=2)
    sess.run(tf.initialize_all_variables())
    checkpoint = tf.train.get_checkpoint_state("saved_networks")
    if checkpoint and checkpoint.model_checkpoint_path:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("Successfully loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old network weights")

    # start training
    epsilon = INITIAL_EPSILON
    t = 0
    while "flappy bird" != "angry bird":
    # for _ in range(epochs):
        # while not terminal:
        # choose an action epsilon greedily
        readout_t = readout.eval(feed_dict={s : [s_t]})[0]
        a_t = np.zeros([ACTIONS])
        action_index = 0
        if t % FRAME_PER_ACTION == 0:
            if random.random() <= epsilon:
                print("----------Random Action----------")
                action_index = random.randrange(ACTIONS)
                a_t[random.randrange(ACTIONS)] = 1
            else:
                action_index = np.argmax(readout_t)
                a_t[action_index] = 1
        else:
            a_t[0] = 1 # do nothing

        # scale down epsilon
        if epsilon > FINAL_EPSILON and t > OBSERVE:
            epsilon -= (INITIAL_EPSILON - FINAL_EPSILON) / EXPLORE

        # run the selected action and observe next state and reward
        x_t1_colored, r_t, terminal = game_state.frame_step(a_t)
        x_t1 = cv2.cvtColor(cv2.resize(x_t1_colored, (80, 80)), cv2.COLOR_BGR2GRAY)
        ret, x_t1 = cv2.threshold(x_t1, 1, 255, cv2.THRESH_BINARY)
        x_t1 = np.reshape(x_t1, (80, 80, 1))
        #s_t1 = np.append(x_t1, s_t[:,:,1:], axis = 2)
        s_t1 = np.append(x_t1, s_t[:, :, :3], axis=2) # drop the oldest frame each step so the input stays (80, 80, 4)

        # store the transition in D
        D.append((s_t, a_t, r_t, s_t1, terminal))
        if len(D) > REPLAY_MEMORY: # why not just construct the deque with maxlen=REPLAY_MEMORY?
            D.popleft()

        # only train if done observing
        if t > OBSERVE:
            # sample a minibatch to train on
            minibatch = random.sample(D, BATCH)

            # get the batch variables
            s_j_batch = [d[0] for d in minibatch]
            a_batch = [d[1] for d in minibatch]
            r_batch = [d[2] for d in minibatch]
            s_j1_batch = [d[3] for d in minibatch]

            y_batch = []
            readout_j1_batch = readout.eval(feed_dict = {s : s_j1_batch})
            for i in range(0, len(minibatch)):
                terminal = minibatch[i][4]
                # if terminal, only equals reward
                if terminal:
                    y_batch.append(r_batch[i])
                else:
                    y_batch.append(r_batch[i] + GAMMA * np.max(readout_j1_batch[i]))

            # perform gradient step
            train_step.run(feed_dict = {
                y : y_batch,
                a : a_batch,
                s : s_j_batch}
            )

        # update the old values
        s_t = s_t1
        t += 1

        # save progress every 10000 iterations
        if t % 10000 == 0:
            saver.save(sess, 'saved_networks/' + GAME + '-dqn', global_step = t)

        # print info
        state = ""
        if t <= OBSERVE:
            state = "observe"
        elif t > OBSERVE and t <= OBSERVE + EXPLORE:
            state = "explore"
        else:
            state = "train"

        print("TIMESTEP", t, "/ STATE", state, \
            "/ EPSILON", epsilon, "/ ACTION", action_index, "/ REWARD", r_t, \
            "/ Q_MAX %e" % np.max(readout_t))
        # write info to files

        if t % 10000 <= 100:
            a_file.write(",".join([str(x) for x in readout_t]) + '\n')
            h_file.write(",".join([str(x) for x in h_fc1.eval(feed_dict={s:[s_t]})[0]]) + '\n')
            cv2.imwrite("logs_tetris/frame" + str(t) + ".png", x_t1)


def playGame():
    sess = tf.InteractiveSession() # InteractiveSession installs itself as the default session, like with tf.Session() as sess, so op.run() or tensor.eval() can be called directly without passing sess; otherwise you would write sess.run(op)
    s, readout, h_fc1 = createNetwork()
    trainNetwork(s, readout, h_fc1, sess)

def main():
    playGame()

if __name__ == "__main__":
    main()


#24

You're learning remarkably fast, nice work :+1:


#25

TensorFlow LSTM: c_state, m_state = cell_state (C), hidden_state (H) (the output state)
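
A minimal TF 1.x sketch (my own illustration) showing that an LSTM cell's state is a tuple of the cell state c and the hidden/output state h:

import tensorflow as tf

cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=8, state_is_tuple=True)
inputs = tf.placeholder(tf.float32, [None, 5, 3])          # [batch, time, features]
outputs, final_state = tf.nn.dynamic_rnn(cell, inputs, dtype=tf.float32)

c_state, m_state = final_state.c, final_state.h            # final_state is an LSTMStateTuple(c=..., h=...)
# outputs[:, -1, :] equals m_state, the hidden/output state at the last time step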


#26

Recently I've been working through the CS231n and CS229 assignments.
I noticed an interesting phenomenon in the last CS229 reinforcement learning assignment when comparing a per-state iterative implementation with a direct matrix computation.

"""
CS 229 Machine Learning, Fall 2017
Problem Set 4
Question: Reinforcement Learning: The inverted pendulum
Author: Sanyam Mehra, sanyam@stanford.edu
"""
from __future__ import division, print_function
from math import sin, cos, pi
import matplotlib.pyplot as plt
import matplotlib.patches as patches

class CartPole:
    def __init__(self, physics):
        self.physics = physics
        self.mass_cart = 1.0
        self.mass_pole = 0.3
        self.mass = self.mass_cart + self.mass_pole
        self.length = 0.7 # actually half the pole length
        self.pole_mass_length = self.mass_pole * self.length

    def simulate(self, action, state_tuple):
        """
        Simulation dynamics of the cart-pole system

        Parameters
        ----------
        action : int
            Action represented as 0 or 1
        state_tuple : tuple
            Continuous vector of x, x_dot, theta, theta_dot

        Returns
        -------
        new_state : tuple
            Updated state vector of new_x, new_x_dot, new_theta, new_theta_dot
        """
        x, x_dot, theta, theta_dot = state_tuple
        costheta, sintheta = cos(theta), sin(theta)
        # costheta, sintheta = cos(theta * 180 / pi), sin(theta * 180 / pi)

        # calculate force based on action
        force = self.physics.force_mag if action > 0 else (-1 * self.physics.force_mag)

        # intermediate calculation
        temp = (force + self.pole_mass_length * theta_dot * theta_dot * sintheta) / self.mass
        theta_acc = (self.physics.gravity * sintheta - temp * costheta) / (self.length * (4/3 - self.mass_pole * costheta * costheta / self.mass))

        x_acc = temp - self.pole_mass_length * theta_acc * costheta / self.mass

        # return new state variable using Euler's method
        new_x = x + self.physics.tau * x_dot
        new_x_dot = x_dot + self.physics.tau * x_acc
        new_theta = theta + self.physics.tau * theta_dot
        new_theta_dot = theta_dot + self.physics.tau * theta_acc
        new_state = (new_x, new_x_dot, new_theta, new_theta_dot)

        return new_state

    def get_state(self, state_tuple):
        """
        Discretizes the continuous state vector. The current discretization
        divides x into 3, x_dot into 3, theta into 6 and theta_dot into 3
        categories. A finer discretization produces a larger state space
        but allows for a better policy

        Parameters
        ----------
        state_tuple : tuple
            Continuous vector of x, x_dot, theta, theta_dot

        Returns
        -------
        state : int
            Discretized state value
        """
        x, x_dot, theta, theta_dot = state_tuple
        # parameters for state discretization in get_state
        # convert degrees to radians
        one_deg = pi / 180
        six_deg = 6 * pi / 180
        twelve_deg = 12 * pi / 180
        fifty_deg = 50 * pi / 180

        total_states = 163
        state = 0

        if x < -2.4 or x > 2.4 or theta < -twelve_deg or theta > twelve_deg:
            state = total_states - 1 # to signal failure
        else:
            # x: 3 categories
            if x < -1.5:
                state = 0
            elif x < 1.5:
                state = 1
            else:
                state = 2
            # x_dot: 3 categories
            if x_dot < -0.5:
                pass
            elif x_dot < 0.5:
                state += 3
            else:
                state += 6
            # theta: 6 categories
            if theta < -six_deg:
                pass
            elif theta < -one_deg:
                state += 9
            elif theta < 0:
                state += 18
            elif theta < one_deg:
                state += 27
            elif theta < six_deg:
                state += 36
            else:
                state += 45
            # theta_dot: 3 categories
            if theta_dot < -fifty_deg:
                pass
            elif theta_dot < fifty_deg:
                state += 54
            else:
                state += 108
        # state += 1 # converting from MATLAB 1-indexing to 0-indexing
        return state

    def show_cart(self, state_tuple, pause_time):
        """
        Given the `state_tuple`, displays the cart-pole system.

        Parameters
        ----------
        state_tuple : tuple
            Continuous vector of x, x_dot, theta, theta_dot
        pause_time : float
            Time delay in seconds

        Returns
        -------
        """
        x, x_dot, theta, theta_dot = state_tuple
        X = [x, x + 4*self.length * sin(theta)]
        Y = [0, 4*self.length * cos(theta)]
        plt.close('all')
        fig, ax = plt.subplots(1)
        plt.ion()
        ax.set_xlim(-3, 3)
        ax.set_ylim(-0.5, 3.5)
        ax.plot(X, Y)
        cart = patches.Rectangle((x - 0.4, -0.25), 0.8, 0.25,
                        linewidth=1, edgecolor='k', facecolor='cyan')
        base = patches.Rectangle((x - 0.01, -0.5), 0.02, 0.25,
                        linewidth=1, edgecolor='k', facecolor='r')
        ax.add_patch(cart)
        ax.add_patch(base)
        x_dot_str, theta_str, theta_dot_str = '\\dot{x}', '\\theta', '\\dot{\\theta}'
        ax.set_title('x: %.3f, $%s$: %.3f, $%s$: %.3f, $%s$: %.3f'\
                                %(x, x_dot_str, x_dot, theta_str, theta, theta_dot_str, theta_dot))
        plt.show()
        plt.pause(pause_time)

class Physics:
    gravity = 9.8
    force_mag = 10.0
    tau = 0.02 # seconds between state updates
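
A quick check of the discretization above (my own snippet; it assumes cart_pole.py is importable, as in the control script that follows):

from cart_pole import CartPole, Physics

cp = CartPole(Physics())
# the all-zero start state falls in the middle bins: x -> 1, x_dot -> +3, theta -> +27, theta_dot -> +54
print(cp.get_state((0.0, 0.0, 0.0, 0.0)))   # 85
print(cp.get_state((3.0, 0.0, 0.0, 0.0)))   # 162 == NUM_STATES - 1, the failure state
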
# -*-coding:utf-8 -*-
"""
CS 229 Machine Learning, Fall 2017
Problem Set 4
Question: Reinforcement Learning: The inverted pendulum
Author: Sanyam Mehra, sanyam@stanford.edu
"""
from __future__ import division, print_function
from cart_pole import CartPole, Physics
import matplotlib.pyplot as plt
import numpy as np
from scipy.signal import lfilter

"""
Parts of the code (cart and pole dynamics, and the state
discretization) are inspired from code available at the RL repository
http://www-anw.cs.umass.edu/rlr/domains.html

This file controls the pole-balancing simulation. You only need to
write code in between places marked
###### BEGIN YOUR CODE ######
###### END YOUR CODE ######

Briefly, the cart-pole system is described in `cart_pole.py`. The main
simulation loop in this file calls the `simulate()` function for
simulating the pole dynamics, `get_state()` for discretizing the
otherwise continuous state space in discrete states, and `show_cart()`
for display.

Some useful parameters are listed below:

`NUM_STATES`: Number of states in the discretized state space
You must assume that states are numbered 0 through `NUM_STATES` - 1. The
state numbered `NUM_STATES` - 1 (the last one) is a special state that
marks the state when the pole has been judged to have fallen (or when
the cart is out of bounds). However, you should NOT treat this state
any differently in your code. Any distinctions you need to make between
states should come automatically from your learning algorithm.

After each simulation cycle, you are supposed to update the transition
counts and rewards observed. However, you should not change either
your value function or the transition probability matrix at each
cycle.

Whenever the pole falls, a section of your code below will be
executed. At this point, you must use the transition counts and reward
observations that you have gathered to generate a new model for the MDP
(i.e. transition probabilities and state rewards). After that, you
must use value iteration to get the optimal value function for this MDP
model.

`TOLERANCE`: Controls the convergence criteria for each value iteration
run. In value iteration, you can assume convergence when the maximum
absolute change in the value function at any state in an iteration
becomes lower than `TOLERANCE.

You need to write code that chooses the best action according
to your current value function, and the current model of the MDP. The
action must be either 0 or 1 (corresponding to possible directions of
pushing the cart)

Finally, we assume that the simulation has converged when
`NO_LEARNING_THRESHOLD` consecutive value function computations all
converged within one value function iteration. Intuitively, it seems
like there will be little learning after this, so we end the simulation
here, and say the overall algorithm has converged.


Learning curves can be generated by calling a code snippet at the end
(it assumes that the learning was just executed, and the array
`time_steps_to_failure` that records the time for which the pole was
balanced before each failure are in memory). `num_failures` is a variable
that stores the number of failures (pole drops / cart out of bounds)
till now.

Other parameters in the code are described below:

`GAMMA`: Discount factor to be used

The following parameters control the simulation display; you dont
really need to know about them:

`pause_time`: Controls the pause between successive frames of the
display. Higher values make your simulation slower.
`min_trial_length_to_start_display`: Allows you to start the display only
after the pole has been successfully balanced for at least this many
trials. Setting this to zero starts the display immediately. Choosing a
reasonably high value (around 100) can allow you to rush through the
initial learning quickly, and start the display only after the
performance is reasonable.
"""


# Simulation parameters
pause_time = 0.0001
min_trial_length_to_start_display = 100
display_started = min_trial_length_to_start_display == 0

NUM_STATES = 163
NUM_ACTIONS = 2
GAMMA = 0.995
TOLERANCE = 0.01
NO_LEARNING_THRESHOLD = 20

# Time cycle of the simulation
time = 0

# These variables perform bookkeeping (how many cycles was the pole
# balanced for before it fell). Useful for plotting learning curves.
time_steps_to_failure = []
num_failures = 0
time_at_start_of_current_trial = 0

# You should reach convergence well before this
max_failures = 500

# Initialize a cart pole
cart_pole = CartPole(Physics())

# Starting `state_tuple` is (0, 0, 0, 0)
# x, x_dot, theta, theta_dot represents the actual continuous state vector
x, x_dot, theta, theta_dot = 0.0, 0.0, 0.0, 0.0
state_tuple = (x, x_dot, theta, theta_dot)

# `state` is the number given to this state, you only need to consider
# this representation of the state
state = cart_pole.get_state(state_tuple)
# if min_trial_length_to_start_display == 0 or display_started == 1:
#     cart_pole.show_cart(state_tuple, pause_time)

# Perform all your initializations here:
# Assume no transitions or rewards have been observed.
# Initialize the value function array to small random values (0 to 0.10,
# say).
# Initialize the transition probabilities uniformly (ie, probability of
# transitioning for state x to state y using action a is exactly
# 1/NUM_STATES).
# Initialize all state rewards to zero.

###### BEGIN YOUR CODE ######
# TODO:
# NUM_ACTIONS = 2; for reward_counts the second axis of size 2 holds (reward sum, visit count)
transition_counts = np.zeros((NUM_STATES, NUM_STATES,NUM_ACTIONS)) 
transition_probs = np.ones((NUM_STATES, NUM_STATES, NUM_ACTIONS))/ NUM_STATES
reward_counts = np.zeros((NUM_STATES, 2))
reward = np.zeros(NUM_STATES)
value = np.random.rand(NUM_STATES) * 0.1
# raise NotImplementedError('Initializations not implemented')
###### END YOUR CODE ######

# This is the criterion to end the simulation.
# You should change it to terminate when the previous
# 'NO_LEARNING_THRESHOLD' consecutive value function computations all
# converged within one value function iteration. Intuitively, it seems
# like there will be little learning after this, so end the simulation
# here, and say the overall algorithm has converged.

consecutive_no_learning_trials = 0
while consecutive_no_learning_trials < NO_LEARNING_THRESHOLD:

    # Write code to choose action (0 or 1).
    # This action choice algorithm is just for illustration. It may
    # convince you that reinforcement learning is nice for control
    # problems!Replace it with your code to choose an action that is
    # optimal according to the current value function, and the current MDP
    # model.
    ###### BEGIN YOUR CODE ######
    # TODO:
    score1 = transition_probs[state, :, 0].dot(value) # expected value of action 0
    score2 = transition_probs[state, :, 1].dot(value) # expected value of action 1
    if score1 > score2:
        action = 0
    elif score2 > score1:
        action = 1
    else:
        if np.random.uniform() < 0.5:
            action = 0
        else:
            action = 1
    # raise NotImplementedError('Action choice not implemented')
    # action = 0 if np.random.uniform() < 0.5 else 1
    ###### END YOUR CODE ######

    # Get the next state by simulating the dynamics
    state_tuple = cart_pole.simulate(action, state_tuple)
    # x, x_dot, theta, theta_dot = state_tuple

    # Increment simulation time
    time = time + 1

    # Get the state number corresponding to new state vector
    new_state = cart_pole.get_state(state_tuple)
    # if display_started == 1:
    #     cart_pole.show_cart(state_tuple, pause_time)

    # reward function to use - do not change this!
    if new_state == NUM_STATES - 1:
        R = -1
    else:
        R = 0

    # Perform model updates here.
    # A transition from `state` to `new_state` has just been made using
    # `action`. The reward observed in `new_state` (note) is `R`.
    # Write code to update your statistics about the MDP i.e. the
    # information you are storing on the transitions and on the rewards
    # observed. Do not change the actual MDP parameters, except when the
    # pole falls (the next if block)!

    ###### BEGIN YOUR CODE ######
    # TODO:
    # raise NotImplementedError('Update T and R not implemented')
    # record the number of times `state, action, new_state` occurs
    # record the rewards for every `new_state`
    # record the number of time `new_state` was reached
    transition_counts[state, new_state, action] += 1
    reward_counts[new_state, 0] += R
    reward_counts[new_state, 1] += 1 
    ###### END YOUR CODE ######

    # Recompute MDP model whenever pole falls
    # Compute the value function V for the new model
    if new_state == NUM_STATES - 1:

        # Update MDP model using the current accumulated statistics about the
        # MDP - transitions and rewards.
        # Make sure you account for the case when a state-action pair has never
        # been tried before, or the state has never been visited before. In that
        # case, you must not change that component (and thus keep it at the
        # initialized uniform distribution).

        ###### BEGIN YOUR CODE ######
        # TODO:
        dens = transition_counts.sum(axis = 1)
        dens = dens[:, np.newaxis, :]
        dens = np.tile(dens, (1, NUM_STATES, 1))
        mask = (dens > 0)
        transition_probs[mask] = transition_counts[mask] / dens[mask]
        '''
        for action in range(NUM_ACTIONS):
            for state in range(NUM_STATES):
                den = transition_counts[state, :, action].sum()
                if den > 0:
                    transition_probs[state, :, action] = transition_counts[state, :, action] / den
        '''
        mask = (reward_counts[:,1] > 0)
        reward[mask] = reward_counts[mask, 0] / reward_counts[mask, 1]
        print(reward)
        print(transition_probs)
        '''
        for state in range(NUM_STATES):
            den = reward_counts[state, 1]
            if den > 0:
                reward[state] = reward_counts[state, 0] / den
        '''
            
        # raise NotImplementedError('MDP  T and R update not implemented')
        ###### END YOUR CODE ######

        # Perform value iteration using the new estimated model for the MDP.
        # The convergence criterion should be based on `TOLERANCE` as described
        # at the top of the file.
        # If it converges within one iteration, you may want to update your
        # variable that checks when the whole simulation must end.

        ###### BEGIN YOUR CODE ######
        # TODO:
        iterations = 0
        value_prime = np.zeros(NUM_STATES)
        while True:
            iterations += 1
            # Two implementations here; I don't know why it fails to converge if new_value is used directly as the intermediate term
            # value_prime = np.max((transition_probs * (value.reshape(-1,1))).sum(axis = 1), axis = 1)
            
            for state in range(NUM_STATES):
                value1 = transition_probs[state, :, 0].dot(value)
                value2 = transition_probs[state, :, 1].dot(value)
                # print(value1,value2)
                # print(np.maximum(value1, value2))
                # print(state)
                value_prime[state] = np.maximum(value1, value2)
            # print(new_value)
            
            new_value = reward + GAMMA * value_prime
            #print(new_value)
            #print(value)
            # print(reward)
            diff = np.max(np.abs(value - new_value))
            # print(np.abs(value - new_value))
            # print(diff)
            value = new_value
            # print(iterations)
            if diff < TOLERANCE:
                break 
                
        if iterations == 1:
            consecutive_no_learning_trials += 1
        else:
            consecutive_no_learning_trials = 0
        # raise NotImplementedError('Value iteration choice not implemented')
        ###### END YOUR CODE ######

    # Do NOT change this code: Controls the simulation, and handles the case
    # when the pole fell and the state must be reinitialized.
    if new_state == NUM_STATES - 1:
        num_failures += 1
        if num_failures >= max_failures:
            break
        print('[INFO] Failure number {}'.format(num_failures))
        time_steps_to_failure.append(time - time_at_start_of_current_trial)
        # time_steps_to_failure[num_failures] = time - time_at_start_of_current_trial
        time_at_start_of_current_trial = time

        if time_steps_to_failure[num_failures - 1] > min_trial_length_to_start_display:
            display_started = 1

        # Reinitialize state
        # x = 0.0
        x = -1.1 + np.random.uniform() * 2.2
        x_dot, theta, theta_dot = 0.0, 0.0, 0.0
        state_tuple = (x, x_dot, theta, theta_dot)
        state = cart_pole.get_state(state_tuple)
    else:
        state = new_state

# plot the learning curve (time balanced vs. trial)
log_tstf = np.log(np.array(time_steps_to_failure))
plt.plot(np.arange(len(time_steps_to_failure)), log_tstf, 'k')
window = 30
w = np.array([1/window for _ in range(window)])
weights = lfilter(w, 1, log_tstf)
x = np.arange(window//2, len(log_tstf) - window//2)
plt.plot(x, weights[window:len(log_tstf)], 'r--')
plt.xlabel('Num failures')
plt.ylabel('Num steps to failure')
plt.show()

Learning curve from the per-state loop implementation:
https://github.com/luxuriance19/CS229_2017FALL/blob/master/assignment4/iteration.png
Learning curve from the matrix (vectorized) implementation: [image]

The results of this implementation also differ from those of the MATLAB version, which is a bit puzzling.


#27

Dilated convolution (atrous convolution): the difference from an ordinary convolution is that the kernel is applied to the input with gaps, i.e. there are (dilation rate - 1) skipped positions between adjacent kernel parameters.
The 2-D TF implementation is tf.nn.atrous_conv2d.
Other dimensionalities are handled by tf.nn.convolution (see the sketch below).
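A minimal sketch of a dilated convolution (assuming TF 1.x; the input size, kernel size, and rate below are arbitrary choices for illustration):

import numpy as np
import tensorflow as tf

# A single-channel 8x8 input and a 3x3 kernel applied with rate=2: there is a gap of
# rate-1 = 1 pixel between adjacent kernel taps, so the effective receptive field is 5x5.
x = tf.placeholder(tf.float32, [1, 8, 8, 1])                      # NHWC
w = tf.get_variable('w_atrous', [3, 3, 1, 1],
                    initializer=tf.truncated_normal_initializer(stddev=0.02))

y_atrous = tf.nn.atrous_conv2d(x, w, rate=2, padding='SAME')
# tf.nn.convolution with dilation_rate covers the general (1-D / 2-D / 3-D) case.
y_generic = tf.nn.convolution(x, w, padding='SAME', dilation_rate=[2, 2])

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    a, b = sess.run([y_atrous, y_generic], feed_dict={x: np.random.rand(1, 8, 8, 1)})
    print(a.shape, np.allclose(a, b))                             # (1, 8, 8, 1) True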

An implementation of causal convolution can be found in the blog below (a rough sketch follows as well):
https://medium.com/the-artificial-impostor/notes-understanding-tensorflow-part-3-7f6633fcc7c7
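A rough sketch of the usual causal-convolution trick on a 1-D sequence (my own illustration, assuming TF 1.x): left-pad the time axis by (kernel_width - 1) * dilation and run a 'VALID' convolution, so output step t only sees inputs at steps <= t.

import numpy as np
import tensorflow as tf

def causal_conv1d(x, w, dilation=1):
    """x: [batch, time, channels]; w: [width, in_channels, out_channels]."""
    width = int(w.get_shape()[0])
    pad = (width - 1) * dilation
    # Pad only the left side of the time axis so the convolution cannot look into the future.
    x_padded = tf.pad(x, [[0, 0], [pad, 0], [0, 0]])
    return tf.nn.convolution(x_padded, w, padding='VALID', dilation_rate=[dilation])

x = tf.placeholder(tf.float32, [1, 16, 1])
w = tf.get_variable('w_causal', [3, 1, 1],
                    initializer=tf.truncated_normal_initializer(stddev=0.02))
y = causal_conv1d(x, w, dilation=2)        # output length stays 16

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(y, feed_dict={x: np.random.rand(1, 16, 1)}).shape)   # (1, 16, 1)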


#28

Haven't updated in a long time; only just noticed that the final assignment has already been posted in the group. Leaving a placeholder here for now.


#29

1. GAN variants:

From the "FID: smaller is better" figure (taken from a paper) in Hung-yi Lee's slides, there are many kinds of GAN: MM GAN, NS GAN, LSGAN, WGAN, WGAN GP, DRAGAN, BEGAN.
Those mentioned in the course:
BiGAN: jointly trains an encoder and a decoder (generator), tied together by a discriminator (see the Bi-GAN notes below)
TriGAN
conditional GAN: not only generates an image, but also imposes a condition on what is generated
cycle GAN: trains two generators and two discriminators; the cycle-consistency constraint keeps a generator from ignoring its input (related: Disco GAN, Dual GAN)
EBGAN: energy-based GAN
WGAN: uses the earth mover's distance, which measures the distance between two distributions better than KL divergence
fGAN: Fenchel conjugate, f-divergence (the main change here is to the GAN loss)
starGAN: can generate data for multiple domains
infoGAN: adds an auto-encoder-like structure
VAEGAN: also has an auto-encoder structure; the generator is the decoder part, and the auto-encoder structure makes the generator easier to train

2. How GAN works

Train one generator and one discriminator: the discriminator should judge real images as 1 and generated images as 0, while the generator is trained to fool the discriminator so that its outputs get high scores. Training procedure: train the discriminator once or several times, then train the generator.
Algorithm (a minimal runnable sketch follows the list):
Discriminator update:

  1. Sample m examples {x1, x2, …, xm} from the dataset
  2. Sample m noise samples {z1, z2, …, zm} from a prior distribution
  3. Obtain the generated data {x1', x2', …, xm'}, xi' = G(zi)
  4. Update the discriminator parameters theta_d by gradient ascent to maximize the value:
     V = (1/m) * sum_i( log D(xi) + log(1 - D(xi')) )
     theta_d = theta_d + eta * grad_theta_d(V)

Generator update:

  5. Sample m noise samples {z1, z2, …, zm} from the prior distribution
  6. Update the generator parameters theta_g by gradient ascent to maximize
     V = (1/m) * sum_i( log D(G(zi)) )
     theta_g = theta_g + eta * grad_theta_g(V)
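A minimal sketch of this alternating update on a toy 1-D problem (my own illustration, assuming TF >= 1.4; the layer sizes, learning rate, and the N(3, 0.5) target distribution are made-up assumptions, not part of the course material):

import numpy as np
import tensorflow as tf

BATCH = 64

def generator(z):
    with tf.variable_scope('G', reuse=tf.AUTO_REUSE):
        h = tf.layers.dense(z, 32, tf.nn.relu, name='h')
        return tf.layers.dense(h, 1, name='out')

def discriminator(x):
    with tf.variable_scope('D', reuse=tf.AUTO_REUSE):
        h = tf.layers.dense(x, 32, tf.nn.relu, name='h')
        return tf.layers.dense(h, 1, name='logit')       # a logit, not a probability

x_real = tf.placeholder(tf.float32, [None, 1])
z = tf.placeholder(tf.float32, [None, 1])
x_fake = generator(z)
logit_real, logit_fake = discriminator(x_real), discriminator(x_fake)

# Step 4: maximizing log D(xi) + log(1 - D(xi')) is the same as minimizing this cross-entropy.
d_loss = tf.reduce_mean(
    tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.ones_like(logit_real), logits=logit_real) +
    tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.zeros_like(logit_fake), logits=logit_fake))
# Step 6: maximizing log D(G(zi)) is the same as minimizing the cross-entropy with "real" labels.
g_loss = tf.reduce_mean(
    tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.ones_like(logit_fake), logits=logit_fake))

d_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='D')
g_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='G')
d_step = tf.train.AdamOptimizer(1e-3).minimize(d_loss, var_list=d_vars)
g_step = tf.train.AdamOptimizer(1e-3).minimize(g_loss, var_list=g_vars)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for it in range(2000):
        real = np.random.normal(3.0, 0.5, size=(BATCH, 1))
        noise = np.random.uniform(-1, 1, size=(BATCH, 1))
        sess.run(d_step, feed_dict={x_real: real, z: noise})                         # discriminator step
        sess.run(g_step, feed_dict={z: np.random.uniform(-1, 1, size=(BATCH, 1))})   # generator step
    print(sess.run(x_fake, feed_dict={z: np.random.uniform(-1, 1, size=(10, 1))}).mean())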

3. How DCGAN works

The model architecture is changed in the following ways:

Replace pooling layers with convolutions: strided convolutions in the discriminator, fractionally-strided convolutions in the generator.
Use batchnorm in both the generator and the discriminator. This mitigates poor initialization, helps gradients propagate to every layer, and keeps the generator from collapsing all samples to a single point. Applying BN directly to every layer causes sample oscillation and model instability; this is avoided by not using BN on the generator's output layer and the discriminator's input layer.
Remove the fully connected layers; global pooling increases model stability but hurts convergence speed.
Use ReLU in all generator layers except the output layer, which uses tanh.
Use LeakyReLU in all layers of the discriminator.
I will fill in the concrete network structure after going through the code (a short sketch of the two pooling replacements follows the TO DO):

TO DO
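A quick sketch of the two pooling replacements mentioned above (assuming TF >= 1.4; the shapes and filter counts are arbitrary): a stride-2 convolution downsamples in the discriminator, and a fractionally-strided (transpose) convolution upsamples in the generator.

import tensorflow as tf

x_img = tf.placeholder(tf.float32, [64, 32, 32, 3])     # e.g. a discriminator input
x_feat = tf.placeholder(tf.float32, [64, 8, 8, 128])    # e.g. an intermediate generator feature map

# Discriminator side: strided convolution instead of conv + pooling, 32x32 -> 16x16.
down = tf.layers.conv2d(x_img, filters=64, kernel_size=5, strides=2,
                        padding='same', activation=tf.nn.leaky_relu, name='d_conv')

# Generator side: fractionally-strided (transpose) convolution, 8x8 -> 16x16.
up = tf.layers.conv2d_transpose(x_feat, filters=64, kernel_size=5, strides=2,
                                padding='same', activation=tf.nn.relu, name='g_deconv')

print(down.get_shape())   # (64, 16, 16, 64)
print(up.get_shape())     # (64, 16, 16, 64)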

References:
https://blog.csdn.net/stdcoutzyx/article/details/53872121
https://blog.csdn.net/wspba/article/details/54730871

An article introducing DCGAN:
http://bamos.github.io/2016/08/09/deep-completion/

Some notes from that blog follow. 1. Treating images as samples from a probability distribution:

  1. contextual information: infer the missing pixels from the surrounding pixels of the image
  2. perceptual information: the filled-in part of the image should contain the kind of "normal" content one would usually see in other images or in the real world.

DCGAN uses fractionally-strided convolutions.

4. How InfoGAN works

The generator takes two inputs, a code c and random noise z, and produces x. The generated x is fed to a classifier that must recognize it as c, so c must influence the generated x in some way; in other words, feeding c into the generator forces it to control certain features of the generated image. x is also connected to a discriminator, which judges the generated image. The discriminator and the classifier share network parameters except for the last layer.
(This only summarizes what was covered in the course; I have not looked up the InfoGAN paper myself.)
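A rough sketch of that weight-sharing idea (my own illustration, assuming TF >= 1.4 and a categorical code c; all sizes are made up, and only the shared trunk and the "recover c" loss are shown, not the full InfoGAN objective):

import tensorflow as tf

N_CLS, Z_DIM, BATCH = 10, 62, 32

c = tf.placeholder(tf.int32, [BATCH])            # the code fed to the generator
z = tf.placeholder(tf.float32, [BATCH, Z_DIM])   # random noise
c_onehot = tf.one_hot(c, N_CLS)

def generator(code, noise):
    with tf.variable_scope('G', reuse=tf.AUTO_REUSE):
        h = tf.layers.dense(tf.concat([code, noise], axis=1), 128, tf.nn.relu, name='h')
        return tf.layers.dense(h, 784, name='out')               # a flat "image"

def d_and_q(x):
    # Discriminator and classifier share the trunk; only the last layers differ.
    with tf.variable_scope('D', reuse=tf.AUTO_REUSE):
        trunk = tf.layers.dense(x, 128, tf.nn.leaky_relu, name='trunk')
        d_logit = tf.layers.dense(trunk, 1, name='d_head')       # real / fake score
        q_logits = tf.layers.dense(trunk, N_CLS, name='q_head')  # tries to recover c
    return d_logit, q_logits

x_fake = generator(c_onehot, z)
d_logit_fake, q_logits = d_and_q(x_fake)

# The classifier must be able to read c back from G(c, z), which forces c to
# influence the generated image.
q_loss = tf.reduce_mean(
    tf.nn.sparse_softmax_cross_entropy_with_logits(labels=c, logits=q_logits))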

TO DO

VAE-GAN

The structure is encoder - generator (decoder) - discriminator: the encoder and decoder minimize the reconstruction error, while the discriminator distinguishes real, generated, and reconstructed images. Algorithm (a loss-level sketch follows the list):
Initialize the encoder, decoder, and discriminator.
In each iteration:

  1. Sample M images x1, x2, …, xm from the dataset
  2. Produce M codes with the encoder: zi' = En(xi), giving z1', z2', …, zm'
  3. Produce M reconstructed images with the decoder: xi' = De(zi'), giving x1', x2', …, xm'
  4. Sample M codes z1, z2, …, zm from the prior P(z)
  5. Produce M images with the decoder: xi" = De(zi)
  6. Update the encoder to decrease ||xi' - xi|| and to decrease KL(P(zi'|xi) || P(z))
  7. Update the decoder to decrease ||xi' - xi|| and to increase discriminator(xi') and discriminator(xi")
  8. Update the discriminator to increase discriminator(xi) and to decrease discriminator(xi') and discriminator(xi")
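A loss-level sketch of steps 6-8 (my own illustration with flat MLPs, a unit-Gaussian prior and TF >= 1.4; the layer sizes and the squared-error reconstruction term are assumptions, not the course's exact formulation):

import tensorflow as tf

X_DIM, Z_DIM, BATCH = 784, 32, 64
x = tf.placeholder(tf.float32, [BATCH, X_DIM])
z_prior = tf.random_normal([BATCH, Z_DIM])                     # step 4: samples from P(z)

def encoder(x_in):
    with tf.variable_scope('En', reuse=tf.AUTO_REUSE):
        h = tf.layers.dense(x_in, 256, tf.nn.relu, name='h')
        return tf.layers.dense(h, Z_DIM, name='mu'), tf.layers.dense(h, Z_DIM, name='logvar')

def decoder(z_in):
    with tf.variable_scope('De', reuse=tf.AUTO_REUSE):
        h = tf.layers.dense(z_in, 256, tf.nn.relu, name='h')
        return tf.layers.dense(h, X_DIM, tf.nn.sigmoid, name='out')

def discriminator(x_in):
    with tf.variable_scope('Dis', reuse=tf.AUTO_REUSE):
        h = tf.layers.dense(x_in, 256, tf.nn.leaky_relu, name='h')
        return tf.layers.dense(h, 1, name='logit')

mu, logvar = encoder(x)
z_enc = mu + tf.exp(0.5 * logvar) * tf.random_normal([BATCH, Z_DIM])   # step 2: zi'
x_rec = decoder(z_enc)                                                 # step 3: xi'
x_gen = decoder(z_prior)                                               # step 5: xi"

recon = tf.reduce_mean(tf.reduce_sum(tf.square(x - x_rec), axis=1))    # ||xi' - xi||
kl = tf.reduce_mean(0.5 * tf.reduce_sum(
    tf.exp(logvar) + tf.square(mu) - 1.0 - logvar, axis=1))            # KL(q(z|x) || N(0, I))

def bce(logit, label):
    return tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(
        labels=label * tf.ones_like(logit), logits=logit))

d_real, d_rec, d_gen = discriminator(x), discriminator(x_rec), discriminator(x_gen)
enc_loss = recon + kl                                                  # step 6
dec_loss = recon + bce(d_rec, 1.0) + bce(d_gen, 1.0)                   # step 7: fool the discriminator
dis_loss = bce(d_real, 1.0) + bce(d_rec, 0.0) + bce(d_gen, 0.0)        # step 8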

Bi-GAN

Here the encoder and decoder are not connected directly; the discriminator ties them together: given an input pair, the discriminator judges whether it comes from the encoder or from the decoder.
Algorithm (a sketch of the pair discriminator follows the list):
Initialize encoder En, decoder De, discriminator Dis.
In each iteration:

  1. Sample M images from the dataset, denoted xi
  2. Produce M codes with the encoder: zi' = En(xi)
  3. Sample M codes zi from the prior P(z)
  4. Produce M images with the decoder: xi' = De(zi)
  5. Update the discriminator to increase Dis(xi, zi') and decrease Dis(xi', zi)
  6. Update the encoder and decoder to decrease Dis(xi, zi') and increase Dis(xi', zi)
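A rough sketch of the joint (image, code) discriminator (my own illustration, assuming TF >= 1.4 and flat MLPs; the sizes are arbitrary):

import tensorflow as tf

X_DIM, Z_DIM, BATCH = 784, 32, 64
x = tf.placeholder(tf.float32, [BATCH, X_DIM])
z = tf.random_normal([BATCH, Z_DIM])                           # step 3: codes from the prior

def En(x_in):
    with tf.variable_scope('En', reuse=tf.AUTO_REUSE):
        h = tf.layers.dense(x_in, 256, tf.nn.relu, name='h')
        return tf.layers.dense(h, Z_DIM, name='out')

def De(z_in):
    with tf.variable_scope('De', reuse=tf.AUTO_REUSE):
        h = tf.layers.dense(z_in, 256, tf.nn.relu, name='h')
        return tf.layers.dense(h, X_DIM, tf.nn.sigmoid, name='out')

def Dis(x_in, z_in):
    # The discriminator always sees an (image, code) pair.
    with tf.variable_scope('Dis', reuse=tf.AUTO_REUSE):
        h = tf.layers.dense(tf.concat([x_in, z_in], axis=1), 256, tf.nn.leaky_relu, name='h')
        return tf.layers.dense(h, 1, name='logit')

logit_enc = Dis(x, En(x))      # (xi, zi') pair, coming from the encoder
logit_dec = Dis(De(z), z)      # (xi', zi) pair, coming from the decoder

def bce(logit, label):
    return tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(
        labels=label * tf.ones_like(logit), logits=logit))

dis_loss = bce(logit_enc, 1.0) + bce(logit_dec, 0.0)           # step 5
en_de_loss = bce(logit_enc, 0.0) + bce(logit_dec, 1.0)         # step 6: fool the discriminator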

5. Differences between GAN and VAE

I studied VAE earlier in CS231n; its derivation feels somewhat like the EM algorithm, as if the whole procedure were the EM algorithm implemented with a neural network.
A VAE consists of an encoder and a decoder, both neural networks.
The input goes through the NN encoder, which produces two sets of outputs; a sample is then drawn from a normal distribution, multiplied element-wise with one set of encoder outputs and added to the other, and this perturbed vector is fed into the NN decoder; the reconstruction error is minimized to produce the final output (a sketch of this reparameterization step follows below).
The images a VAE produces are not like real images: it reconstructs an output from a perturbed encoding, and the network's goal is not to construct a realistic image.
A GAN's goal is exactly to make the generator produce images close to real ones in order to fool the discriminator, so GAN samples look more realistic than VAE samples.
Essentially, GAN and VAE differ in how they are optimized, in their loss values, and in the goal of the generated images.
(These are only preliminary conclusions that I have not verified against references.)
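A minimal sketch of the "multiply one output, add the other" step, i.e. the reparameterization trick (assuming the encoder outputs a mean and a log-variance; the shapes are arbitrary):

import tensorflow as tf

BATCH, Z_DIM = 64, 32
# Assumed encoder outputs: the mean and log-variance of the code distribution.
mu = tf.placeholder(tf.float32, [BATCH, Z_DIM])
logvar = tf.placeholder(tf.float32, [BATCH, Z_DIM])

# Draw eps from a standard normal, scale it by the (exponentiated) encoder output
# and shift it by the mean; the result is the perturbed code fed to the decoder.
eps = tf.random_normal([BATCH, Z_DIM])
z = mu + tf.exp(0.5 * logvar) * eps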

A comparison from 机器之心 on Zhihu: [image]

6. How AAE works

AAE (adversarial autoencoder) turns an autoencoder into a generative model. The autoencoder is trained with two objectives: the usual reconstruction error, plus an adversarial criterion that matches the aggregated posterior distribution of the autoencoder's latent representation to an arbitrary prior distribution.
The encoder learns to transform the data distribution into the prior distribution, and the decoder learns a deep generative model that maps the prior onto the data distribution.

From the blog (https://blog.csdn.net/yinruiyang94/article/details/77839048):
p(z) is an arbitrary prior, q(z) the aggregated posterior, q(z|x) the encoding distribution, p(x|z) the decoding distribution, Pd(x) the real data distribution, and p(x) the model distribution.

Below is the distribution produced by the model's encoder: [image]

In AAE, what the discriminator judges is not an image but the encoded vector z; the corresponding real data and fake data come, respectively, from the autoencoder's encoder and from a chosen prior distribution (the paper uses a single Gaussian, a mixture of 10 Gaussians, and a swiss-roll distribution), and the part of the final network that produces images (i.e. the generator) is the generator G from before.
The concrete training steps (a loss-level sketch follows the list):

  1. The autoencoder reconstructs the image: minimize the reconstruction error so that the decoder can recover the original image content from the codes produced by the encoder;
  2. Adversarial training of generator and discriminator: the discriminator learns to tell whether a code vector comes from the encoder or from p(z), and the encoder part of the autoencoder is then trained to produce latent variables z closer to p(z) so as to fool the discriminator.
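A loss-level sketch of these two steps (my own illustration, assuming TF >= 1.4, flat MLPs and a Gaussian prior; here the prior samples are labelled 1 and the encoder codes 0, which is just one labelling convention, and all sizes are arbitrary):

import tensorflow as tf

X_DIM, Z_DIM, BATCH = 784, 8, 64
x = tf.placeholder(tf.float32, [BATCH, X_DIM])
z_prior = tf.random_normal([BATCH, Z_DIM])                     # samples from the chosen prior p(z)

def encoder(x_in):
    with tf.variable_scope('En', reuse=tf.AUTO_REUSE):
        h = tf.layers.dense(x_in, 256, tf.nn.relu, name='h')
        return tf.layers.dense(h, Z_DIM, name='out')

def decoder(z_in):
    with tf.variable_scope('De', reuse=tf.AUTO_REUSE):
        h = tf.layers.dense(z_in, 256, tf.nn.relu, name='h')
        return tf.layers.dense(h, X_DIM, tf.nn.sigmoid, name='out')

def discriminator(z_in):
    # Note: this discriminator looks at codes z, not at images.
    with tf.variable_scope('Dis', reuse=tf.AUTO_REUSE):
        h = tf.layers.dense(z_in, 64, tf.nn.leaky_relu, name='h')
        return tf.layers.dense(h, 1, name='logit')

z_enc = encoder(x)                                             # codes produced by the encoder
x_rec = decoder(z_enc)

def bce(logit, label):
    return tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(
        labels=label * tf.ones_like(logit), logits=logit))

recon_loss = tf.reduce_mean(tf.square(x - x_rec))              # step 1: reconstruction
dis_loss = bce(discriminator(z_prior), 1.0) + bce(discriminator(z_enc), 0.0)   # step 2: discriminator
enc_adv_loss = bce(discriminator(z_enc), 1.0)                  # step 2: encoder tries to fool it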

In the reproduction linked in the reference below, the visualization of the learned codes shows that the distribution of the encoding vectors of the MNIST data after the encoder is essentially the same as the chosen input p(z) (the prior), which is why the blog says this method behaves somewhat like a clustering algorithm.

The above is unsupervised learning; the post also describes a supervised variant: the AAE discriminator receives not only p(z) but also an additional one-hot vector input that carries the class label of the sample.

As described in the blog:
In MNIST this label is the digit the image represents. Adding a label only to the real data is clearly not enough; the fake data must be considered as well, so the author proposes forging a corresponding z according to the data's label: for example, with a mixture of 10 Gaussians, all samples drawn from Gaussian 0 are labelled 0, samples from Gaussian 1 are labelled 1, and so on. Early in training the fake data always match their labels, while the encodings z of the real data have no such strict correspondence with the labels; as training proceeds, q(z) gradually develops this correspondence as well. Below are the results after adding the label information (still q(z) and p(z)); note that after convergence the samples of each class cluster much better than in the earlier unsupervised setting.

Here, does the "real data" refer to the input Gaussian distribution, and the "fake data" to the distribution of the MNIST encodings?

The original Gaussian mixture distribution:
[image]
The generated result after adding the one-hot vector:
[image]

References:
http://closure11.com/对抗自编码器:adversarial-autoencoders/


#30

train.py

import tensorflow as tf
import numpy as np
import model
import argparse
import pickle
from os.path import join
import h5py
from Utils import image_processing
import scipy.misc
import random
import json
import os
import shutil

def main():
	parser = argparse.ArgumentParser()
	parser.add_argument('--z_dim', type=int, default=100,
					   help='Noise dimension')

	parser.add_argument('--t_dim', type=int, default=256,
					   help='Text feature dimension')

	parser.add_argument('--batch_size', type=int, default=64,
					   help='Batch Size')

	parser.add_argument('--image_size', type=int, default=64,
					   help='Image Size a, a x a')

	parser.add_argument('--gf_dim', type=int, default=64,
					   help='Number of conv in the first layer gen.')

	parser.add_argument('--df_dim', type=int, default=64,
					   help='Number of conv in the first layer discr.')

	parser.add_argument('--gfc_dim', type=int, default=1024,
					   help='Dimension of gen units for fully connected layer 1024')

	parser.add_argument('--caption_vector_length', type=int, default=2400,
					   help='Caption Vector Length')

	parser.add_argument('--data_dir', type=str, default="Data",
					   help='Data Directory')

	parser.add_argument('--learning_rate', type=float, default=0.0002,
					   help='Learning Rate')

	parser.add_argument('--beta1', type=float, default=0.5,
					   help='Momentum for Adam Update')

	parser.add_argument('--epochs', type=int, default=600,
					   help='Max number of epochs')

	parser.add_argument('--save_every', type=int, default=30,
					   help='Save Model/Samples every x iterations over batches')

	parser.add_argument('--resume_model', type=str, default=None,
                       help='Pre-Trained Model Path, to resume from')

	parser.add_argument('--data_set', type=str, default="flowers",
                       help='Data set: MS-COCO, flowers')

	args = parser.parse_args()
	model_options = {
		'z_dim' : args.z_dim,
		't_dim' : args.t_dim,
		'batch_size' : args.batch_size,
		'image_size' : args.image_size,
		'gf_dim' : args.gf_dim,
		'df_dim' : args.df_dim,
		'gfc_dim' : args.gfc_dim,
		'caption_vector_length' : args.caption_vector_length
	}
	
	
	gan = model.GAN(model_options)
	input_tensors, variables, loss, outputs, checks = gan.build_model()
	
	d_optim = tf.train.AdamOptimizer(args.learning_rate, beta1 = args.beta1).minimize(loss['d_loss'], var_list=variables['d_vars'])
	g_optim = tf.train.AdamOptimizer(args.learning_rate, beta1 = args.beta1).minimize(loss['g_loss'], var_list=variables['g_vars'])
	
	sess = tf.InteractiveSession()
	tf.initialize_all_variables().run()
	
	saver = tf.train.Saver()
	if args.resume_model:
		saver.restore(sess, args.resume_model)
	
	loaded_data = load_training_data(args.data_dir, args.data_set)
	
	for i in range(args.epochs):
		'''
		For every batch in each epoch:
		1. update the discriminator (once)
		2. update the generator (twice)
		'''
		batch_no = 0
		while batch_no*args.batch_size < loaded_data['data_length']:
			real_images, wrong_images, caption_vectors, z_noise, image_files = get_training_batch(batch_no, args.batch_size, 
				args.image_size, args.z_dim, args.caption_vector_length, 'train', args.data_dir, args.data_set, loaded_data)
			
			# DISCR UPDATE
			check_ts = [ checks['d_loss1'] , checks['d_loss2'], checks['d_loss3']]
			_, d_loss, gen, d1, d2, d3 = sess.run([d_optim, loss['d_loss'], outputs['generator']] + check_ts,
				feed_dict = {
					input_tensors['t_real_image'] : real_images,
					input_tensors['t_wrong_image'] : wrong_images,
					input_tensors['t_real_caption'] : caption_vectors,
					input_tensors['t_z'] : z_noise,
				})
			
			print("d1", d1)
			print("d2", d2)
			print("d3", d3)
			print("D", d_loss)
			
			# GEN UPDATE
			_, g_loss, gen = sess.run([g_optim, loss['g_loss'], outputs['generator']],
				feed_dict = {
					input_tensors['t_real_image'] : real_images,
					input_tensors['t_wrong_image'] : wrong_images,
					input_tensors['t_real_caption'] : caption_vectors,
					input_tensors['t_z'] : z_noise,
				})

			# GEN UPDATE TWICE, to make sure d_loss does not go to 0
			_, g_loss, gen = sess.run([g_optim, loss['g_loss'], outputs['generator']],
				feed_dict = {
					input_tensors['t_real_image'] : real_images,
					input_tensors['t_wrong_image'] : wrong_images,
					input_tensors['t_real_caption'] : caption_vectors,
					input_tensors['t_z'] : z_noise,
				})
			
			print("LOSSES", d_loss, g_loss, batch_no, i, len(loaded_data['image_list'])/ args.batch_size)
			batch_no += 1
			if (batch_no % args.save_every) == 0:
				print("Saving Images, Model")
				save_for_vis(args.data_dir, real_images, gen, image_files)
				save_path = saver.save(sess, "Data/Models/latest_model_{}_temp.ckpt".format(args.data_set))
		if i%5 == 0:
			save_path = saver.save(sess, "Data/Models/model_after_{}_epoch_{}.ckpt".format(args.data_set, i))

def load_training_data(data_dir, data_set):
	if data_set == 'flowers':
		h = h5py.File(join(data_dir, 'flower_tv.hdf5'))
		flower_captions = {}
		for ds in h.items():
			flower_captions[ds[0]] = np.array(ds[1])
		image_list = [key for key in flower_captions]
		image_list.sort()

		img_75 = int(len(image_list)*0.75)
		training_image_list = image_list[0:img_75]
		random.shuffle(training_image_list)
		
		return {
			'image_list' : training_image_list,
			'captions' : flower_captions,
			'data_length' : len(training_image_list)
		}
	
	else:
		with open(join(data_dir, 'meta_train.pkl')) as f:
			meta_data = pickle.load(f)
		# No preloading for MS-COCO
		return meta_data

def save_for_vis(data_dir, real_images, generated_images, image_files):
	
	shutil.rmtree( join(data_dir, 'samples') )
	os.makedirs( join(data_dir, 'samples') )

	for i in range(0, real_images.shape[0]):
		real_image_255 = np.zeros( (64,64,3), dtype=np.uint8)
		real_images_255 = (real_images[i,:,:,:])
		# save an array as a image
		scipy.misc.imsave( join(data_dir, 'samples/{}_{}.jpg'.format(i, image_files[i].split('/')[-1] )) , real_images_255)

		fake_image_255 = np.zeros( (64,64,3), dtype=np.uint8)
		fake_images_255 = (generated_images[i,:,:,:])
		scipy.misc.imsave(join(data_dir, 'samples/fake_image_{}.jpg'.format(i)), fake_images_255)


def get_training_batch(batch_no, batch_size, image_size, z_dim, 
	caption_vector_length, split, data_dir, data_set, loaded_data = None):
	if data_set == 'mscoco':
		with h5py.File( join(data_dir, 'tvs/'+split + '_tvs_' + str(batch_no))) as hf:
			caption_vectors = np.array(hf.get('tv'))
			caption_vectors = caption_vectors[:,0:caption_vector_length]
		with h5py.File( join(data_dir, 'tvs/'+split + '_tv_image_id_' + str(batch_no))) as hf:
			image_ids = np.array(hf.get('tv'))

		real_images = np.zeros((batch_size, 64, 64, 3))
		wrong_images = np.zeros((batch_size, 64, 64, 3))
		
		image_files = []
		for idx, image_id in enumerate(image_ids):
			image_file = join(data_dir, '%s2014/COCO_%s2014_%.12d.jpg'%(split, split, image_id) )
			image_array = image_processing.load_image_array(image_file, image_size)
			real_images[idx,:,:,:] = image_array
			image_files.append(image_file)
		
		# TODO>> As of Now, wrong images are just shuffled real images.
		first_image = real_images[0,:,:,:]
		for i in range(0, batch_size):
			if i < batch_size - 1:
				wrong_images[i,:,:,:] = real_images[i+1,:,:,:]
			else:
				wrong_images[i,:,:,:] = first_image

		z_noise = np.random.uniform(-1, 1, [batch_size, z_dim])


		return real_images, wrong_images, caption_vectors, z_noise, image_files

	if data_set == 'flowers':
		real_images = np.zeros((batch_size, 64, 64, 3))
		wrong_images = np.zeros((batch_size, 64, 64, 3))
		captions = np.zeros((batch_size, caption_vector_length))

		cnt = 0
		image_files = []
		for i in range(batch_no * batch_size, batch_no * batch_size + batch_size):
			idx = i % len(loaded_data['image_list'])
			image_file =  join(data_dir, 'flowers/jpg/'+loaded_data['image_list'][idx])
			image_array = image_processing.load_image_array(image_file, image_size)
			real_images[cnt,:,:,:] = image_array
			
			# Improve this selection of wrong image
			wrong_image_id = random.randint(0,len(loaded_data['image_list'])-1)
			wrong_image_file =  join(data_dir, 'flowers/jpg/'+loaded_data['image_list'][wrong_image_id])
			wrong_image_array = image_processing.load_image_array(wrong_image_file, image_size)
			wrong_images[cnt, :,:,:] = wrong_image_array

			random_caption = random.randint(0,4)
			captions[cnt,:] = loaded_data['captions'][ loaded_data['image_list'][idx] ][ random_caption ][0:caption_vector_length]
			image_files.append( image_file )
			cnt += 1

		z_noise = np.random.uniform(-1, 1, [batch_size, z_dim])
		return real_images, wrong_images, captions, z_noise, image_files

if __name__ == '__main__':
	main()

model.py
The main structure of the model.

import tensorflow as tf
from Utils import ops

class GAN:
	'''
	OPTIONS
	z_dim : Noise dimension 100
	t_dim : Text feature dimension 256
	image_size : Image Dimension 64
	gf_dim : Number of conv in the first layer generator 64
	df_dim : Number of conv in the first layer discriminator 64
	gfc_dim : Dimension of gen untis for for fully connected layer 1024
	caption_vector_length : Caption Vector Length 2400
	batch_size : Batch Size 64
	'''
	def __init__(self, options):
		self.options = options

		self.g_bn0 = ops.batch_norm(name='g_bn0')
		self.g_bn1 = ops.batch_norm(name='g_bn1')
		self.g_bn2 = ops.batch_norm(name='g_bn2')
		self.g_bn3 = ops.batch_norm(name='g_bn3')

		self.d_bn1 = ops.batch_norm(name='d_bn1')
		self.d_bn2 = ops.batch_norm(name='d_bn2')
		self.d_bn3 = ops.batch_norm(name='d_bn3')
		self.d_bn4 = ops.batch_norm(name='d_bn4')


	def build_model(self):
		img_size = self.options['image_size']
		t_real_image = tf.placeholder('float32', [self.options['batch_size'],img_size, img_size, 3 ], name = 'real_image')
		t_wrong_image = tf.placeholder('float32', [self.options['batch_size'],img_size, img_size, 3 ], name = 'wrong_image')
		t_real_caption = tf.placeholder('float32', [self.options['batch_size'], self.options['caption_vector_length']], name = 'real_caption_input')
		t_z = tf.placeholder('float32', [self.options['batch_size'], self.options['z_dim']]) # noise input placeholder

		fake_image = self.generator(t_z, t_real_caption)
		
		disc_real_image, disc_real_image_logits   = self.discriminator(t_real_image, t_real_caption)
		disc_wrong_image, disc_wrong_image_logits   = self.discriminator(t_wrong_image, t_real_caption, reuse = True)
		disc_fake_image, disc_fake_image_logits   = self.discriminator(fake_image, t_real_caption, reuse = True)

		'''
		g_loss: we want the discriminator's score for fake_image to be as high as possible
		'''
		g_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(disc_fake_image_logits, tf.ones_like(disc_fake_image)))

		'''
		Three loss terms are computed:
		1. real image + matching text: the discriminator should score it as close to 1 as possible
		2. wrong image + real text: the score should be as close to 0 as possible
		3. generated image + real text: the score should be as close to 0 as possible
		'''
		d_loss1 = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(disc_real_image_logits, tf.ones_like(disc_real_image)))
		d_loss2 = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(disc_wrong_image_logits, tf.zeros_like(disc_wrong_image)))
		d_loss3 = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(disc_fake_image_logits, tf.zeros_like(disc_fake_image)))

		d_loss = d_loss1 + d_loss2 + d_loss3

		t_vars = tf.trainable_variables()
		d_vars = [var for var in t_vars if 'd_' in var.name]
		g_vars = [var for var in t_vars if 'g_' in var.name]

		input_tensors = {
			't_real_image' : t_real_image,
			't_wrong_image' : t_wrong_image,
			't_real_caption' : t_real_caption,
			't_z' : t_z
		}

		variables = {
			'd_vars' : d_vars,
			'g_vars' : g_vars
		}

		loss = {
			'g_loss' : g_loss,
			'd_loss' : d_loss
		}

		outputs = {
			'generator' : fake_image
		}

		checks = {
			'd_loss1': d_loss1,
			'd_loss2': d_loss2,
			'd_loss3' : d_loss3,
			'disc_real_image_logits' : disc_real_image_logits,
			'disc_wrong_image_logits' : disc_wrong_image_logits,
			'disc_fake_image_logits' : disc_fake_image_logits
		}
		
		return input_tensors, variables, loss, outputs, checks

	def build_generator(self):
		img_size = self.options['image_size']
		t_real_caption = tf.placeholder('float32', [self.options['batch_size'], self.options['caption_vector_length']], name = 'real_caption_input')
		t_z = tf.placeholder('float32', [self.options['batch_size'], self.options['z_dim']])
		fake_image = self.sampler(t_z, t_real_caption)
		
		input_tensors = {
			't_real_caption' : t_real_caption,
			't_z' : t_z
		}
		
		outputs = {
			'generator' : fake_image
		}

		return input_tensors, outputs

	# Sample Images for a text embedding
	def sampler(self, t_z, t_text_embedding):
		'''
		Same network structure as the generator: given a noise vector and a text description,
		the trained generator produces an image (batch norm layers are run in inference mode).
		'''
		tf.get_variable_scope().reuse_variables()
		
		s = self.options['image_size']
		s2, s4, s8, s16 = int(s/2), int(s/4), int(s/8), int(s/16)
		
		reduced_text_embedding = ops.lrelu( ops.linear(t_text_embedding, self.options['t_dim'], 'g_embedding') )
		z_concat = tf.concat(1, [t_z, reduced_text_embedding])
		z_ = ops.linear(z_concat, self.options['gf_dim']*8*s16*s16, 'g_h0_lin')
		h0 = tf.reshape(z_, [-1, s16, s16, self.options['gf_dim'] * 8])
		h0 = tf.nn.relu(self.g_bn0(h0, train = False))
		
		h1 = ops.deconv2d(h0, [self.options['batch_size'], s8, s8, self.options['gf_dim']*4], name='g_h1')
		h1 = tf.nn.relu(self.g_bn1(h1, train = False))
		
		h2 = ops.deconv2d(h1, [self.options['batch_size'], s4, s4, self.options['gf_dim']*2], name='g_h2')
		h2 = tf.nn.relu(self.g_bn2(h2, train = False))
		
		h3 = ops.deconv2d(h2, [self.options['batch_size'], s2, s2, self.options['gf_dim']*1], name='g_h3')
		h3 = tf.nn.relu(self.g_bn3(h3, train = False))
		
		h4 = ops.deconv2d(h3, [self.options['batch_size'], s, s, 3], name='g_h4')
		
		return (tf.tanh(h4)/2. + 0.5)

	# GENERATOR IMPLEMENTATION based on : https://github.com/carpedm20/DCGAN-tensorflow/blob/master/model.py
	def generator(self, t_z, t_text_embedding):
		'''
		input: t_z: 2-d array [batch_size, noise_dimension]
			   t_text_embedding: 2-d array [batch_size, caption vector]
		Network structure:
		The text embedding goes through one linear layer, and the result is concatenated with the noise vector.
		A linear layer then maps the concatenated vector to all the dimensions of the first conv-transpose layer.

		Four conv-transpose layers follow; the output of each goes through batch norm and ReLU before being
		fed to the next conv-transpose layer.
		After the fourth layer the output has the image dimensions; a tanh nonlinearity followed by a shift
		maps the output into (0, 1), which is the generated image.
		'''
		# image size is a number, type is int
		s = self.options['image_size']
		s2, s4, s8, s16 = int(s/2), int(s/4), int(s/8), int(s/16)

		# caption_vector to text_embedding
		reduced_text_embedding = ops.lrelu( ops.linear(t_text_embedding, self.options['t_dim'], 'g_embedding') )
		z_concat = tf.concat(1, [t_z, reduced_text_embedding])

		# gf_dim : number of conv in the first layer of generator
		z_ = ops.linear(z_concat, self.options['gf_dim']*8*s16*s16, 'g_h0_lin')
		h0 = tf.reshape(z_, [-1, s16, s16, self.options['gf_dim'] * 8])
		h0 = tf.nn.relu(self.g_bn0(h0))
		
		h1 = ops.deconv2d(h0, [self.options['batch_size'], s8, s8, self.options['gf_dim']*4], name='g_h1')
		h1 = tf.nn.relu(self.g_bn1(h1))
		
		h2 = ops.deconv2d(h1, [self.options['batch_size'], s4, s4, self.options['gf_dim']*2], name='g_h2')
		h2 = tf.nn.relu(self.g_bn2(h2))
		
		h3 = ops.deconv2d(h2, [self.options['batch_size'], s2, s2, self.options['gf_dim']*1], name='g_h3')
		h3 = tf.nn.relu(self.g_bn3(h3))
		
		h4 = ops.deconv2d(h3, [self.options['batch_size'], s, s, 3], name='g_h4')
		
		return (tf.tanh(h4)/2. + 0.5)

	# DISCRIMINATOR IMPLEMENTATION based on : https://github.com/carpedm20/DCGAN-tensorflow/blob/master/model.py
	def discriminator(self, image, t_text_embedding, reuse=False):
		'''
		inputs: image: the input image
		        t_text_embedding: the input description embedding

		Four conv layers with 'SAME' padding and stride 2, so width and height are halved at each layer;
		after the four conv layers the spatial size is image_size/16 (4 x 4), matching the spatial size of
		the generator's first feature map.

		Text handling: the text_embedding is linearly transformed to the same dimension (t_dim) as the
		generator's text input, then expanded and tiled to the same spatial dimensions as the convolved
		image features.

		The image features and the tiled text embedding are concatenated along the last dimension;
		a 1x1 convolution changes the channel dimension, the tensor is flattened, and a linear layer maps
		it to a single value.

		A sigmoid on this value gives the probability prediction.
		'''
		if reuse:
			tf.get_variable_scope().reuse_variables()

		# numbers of conv of the first layer of the discriminator
		h0 = ops.lrelu(ops.conv2d(image, self.options['df_dim'], name = 'd_h0_conv')) #32
		h1 = ops.lrelu( self.d_bn1(ops.conv2d(h0, self.options['df_dim']*2, name = 'd_h1_conv'))) #16
		h2 = ops.lrelu( self.d_bn2(ops.conv2d(h1, self.options['df_dim']*4, name = 'd_h2_conv'))) #8
		h3 = ops.lrelu( self.d_bn3(ops.conv2d(h2, self.options['df_dim']*8, name = 'd_h3_conv'))) #4
		
		# ADD TEXT EMBEDDING TO THE NETWORK
		# t_dim: text feature dimensions
		reduced_text_embeddings = ops.lrelu(ops.linear(t_text_embedding, self.options['t_dim'], 'd_embedding'))
		reduced_text_embeddings = tf.expand_dims(reduced_text_embeddings,1)
		reduced_text_embeddings = tf.expand_dims(reduced_text_embeddings,2)
		tiled_embeddings = tf.tile(reduced_text_embeddings, [1,4,4,1], name='tiled_embeddings')
		
		h3_concat = tf.concat( 3, [h3, tiled_embeddings], name='h3_concat')
		h3_new = ops.lrelu( self.d_bn4(ops.conv2d(h3_concat, self.options['df_dim']*8, 1,1,1,1, name = 'd_h3_conv_new'))) #4
		
		h4 = ops.linear(tf.reshape(h3_new, [self.options['batch_size'], -1]), 1, 'd_h3_lin')
		
		return tf.nn.sigmoid(h4), h4

ops.py
Defines the main operations used by the model.

# RESUED CODE FROM https://github.com/carpedm20/DCGAN-tensorflow/blob/master/ops.py
import math
import numpy as np 
import tensorflow as tf

from tensorflow.python.framework import ops


class batch_norm(object):
	"""Code modification of http://stackoverflow.com/a/33950177"""
	def __init__(self, epsilon=1e-5, momentum = 0.9, name="batch_norm"):
		with tf.variable_scope(name):
			self.epsilon = epsilon
			self.momentum = momentum

			self.ema = tf.train.ExponentialMovingAverage(decay=self.momentum)
			self.name = name

	def __call__(self, x, train=True):
		'''
		x: 4-d tensor
		'''
		shape = x.get_shape().as_list()

		if train:
			with tf.variable_scope(self.name) as scope:
				self.beta = tf.get_variable("beta", [shape[-1]],
									initializer=tf.constant_initializer(0.))
				self.gamma = tf.get_variable("gamma", [shape[-1]],
									initializer=tf.random_normal_initializer(1., 0.02))
				
				try:
					batch_mean, batch_var = tf.nn.moments(x, [0, 1, 2], name='moments')
				except:
					batch_mean, batch_var = tf.nn.moments(x, [0, 1], name='moments')
					
				ema_apply_op = self.ema.apply([batch_mean, batch_var])
				self.ema_mean, self.ema_var = self.ema.average(batch_mean), self.ema.average(batch_var)

				with tf.control_dependencies([ema_apply_op]):
					mean, var = tf.identity(batch_mean), tf.identity(batch_var)
		else:
			mean, var = self.ema_mean, self.ema_var

		normed = tf.nn.batch_norm_with_global_normalization(
				x, mean, var, self.beta, self.gamma, self.epsilon, scale_after_normalization=True)

		return normed

def binary_cross_entropy(preds, targets, name=None):
	"""Computes binary cross entropy given `preds`.
	For brevity, let `x = preds`, `z = targets`.  The logistic loss is
		loss(x, z) = - sum_i (z[i] * log(x[i]) + (1 - z[i]) * log(1 - x[i]))
	Args:
		preds: A `Tensor` of type `float32` or `float64`.
		targets: A `Tensor` of the same type and shape as `preds`.
	"""
	eps = 1e-12
	with ops.op_scope([preds, targets], name, "bce_loss") as name:
		preds = ops.convert_to_tensor(preds, name="preds")
		targets = ops.convert_to_tensor(targets, name="targets")
		return tf.reduce_mean(-(targets * tf.log(preds + eps) +
							  (1. - targets) * tf.log(1. - preds + eps)))

def conv_cond_concat(x, y):
	"""Concatenate conditioning vector on feature map axis."""
	x_shapes = x.get_shape()
	y_shapes = y.get_shape()
	return tf.concat(3, [x, y*tf.ones([x_shapes[0], x_shapes[1], x_shapes[2], y_shapes[3]])])

def conv2d(input_, output_dim, 
		   k_h=5, k_w=5, d_h=2, d_w=2, stddev=0.02,
		   name="conv2d"):
	with tf.variable_scope(name):
		w = tf.get_variable('w', [k_h, k_w, input_.get_shape()[-1], output_dim],
							initializer=tf.truncated_normal_initializer(stddev=stddev))
		conv = tf.nn.conv2d(input_, w, strides=[1, d_h, d_w, 1], padding='SAME')

		biases = tf.get_variable('biases', [output_dim], initializer=tf.constant_initializer(0.0))
		conv = tf.reshape(tf.nn.bias_add(conv, biases), conv.get_shape())

		return conv

def deconv2d(input_, output_shape,
			 k_h=5, k_w=5, d_h=2, d_w=2, stddev=0.02,
			 name="deconv2d", with_w=False):
	with tf.variable_scope(name):
		# filter : [height, width, output_channels, in_channels]
		w = tf.get_variable('w', [k_h, k_w, output_shape[-1], input_.get_shape()[-1]],
							initializer=tf.random_normal_initializer(stddev=stddev))
		
		try:
			deconv = tf.nn.conv2d_transpose(input_, w, output_shape=output_shape,
								strides=[1, d_h, d_w, 1])

		# Support for verisons of TensorFlow before 0.7.0
		except AttributeError:
			deconv = tf.nn.deconv2d(input_, w, output_shape=output_shape,
								strides=[1, d_h, d_w, 1])

		biases = tf.get_variable('biases', [output_shape[-1]], initializer=tf.constant_initializer(0.0))
		deconv = tf.reshape(tf.nn.bias_add(deconv, biases), deconv.get_shape())

		if with_w:
			return deconv, w, biases
		else:
			return deconv

def lrelu(x, leak=0.2, name="lrelu"):
	'''Leaky ReLU: negative inputs are scaled by `leak` instead of being zeroed out.'''
	return tf.maximum(x, leak*x)

def linear(input_, output_size, scope=None, stddev=0.02, bias_start=0.0, with_w=False):
	shape = input_.get_shape().as_list()

	with tf.variable_scope(scope or "Linear"):
		matrix = tf.get_variable("Matrix", [shape[1], output_size], tf.float32,
								 tf.random_normal_initializer(stddev=stddev))
		bias = tf.get_variable("bias", [output_size],
			initializer=tf.constant_initializer(bias_start))
		if with_w:
			return tf.matmul(input_, matrix) + bias, matrix, bias
		else:
			return tf.matmul(input_, matrix) + bias