(a) 證明 softmax 函式的一個性質:對輸入的每個分量加上同一個常數偏移 c 後,softmax 的值不變,即 softmax(x) = softmax(x + c)。在實踐中,通常取 c = -max(x),也就是先減去輸入中的最大值,以避免指數運算溢位。

(b) 給出輸入矩陣,N行D列,然後計算每行的softmax函式值,最好是採用向量化來實現,以便為後續提供一個好的基礎。一個非向量化實現的方式,不會得到全部的分數。









import numpy as np

def softmax(x):
    """Compute the softmax function for each row of the input x.

    It is crucial that this function is optimized for speed because
    it will be used frequently in later code.

    The implementation uses the shift invariance of softmax,
    softmax(x) = softmax(x + c): the maximum of each row (or of the
    whole vector) is subtracted before exponentiating, so np.exp never
    sees a positive argument and cannot overflow.

    Arguments:
    x -- A D dimensional vector or N x D dimensional numpy matrix.

    Return:
    A new array with the same shape as x. A vector is normalized as a
    single row; a matrix is normalized along axis 1.
    """
    orig_shape = x.shape

    if len(x.shape) > 1:
        # Matrix: every row gets its own shift and its own normalizer.
        exps = np.exp(x - np.max(x, axis=1, keepdims=True))
        x = exps / np.sum(exps, axis=1, keepdims=True)
    else:
        # Vector: treated as one row.
        exps = np.exp(x - np.max(x))
        x = exps / np.sum(exps)

    assert x.shape == orig_shape
    return x

def test_softmax_basic():
    """Run a few simple softmax checks and print the computed values.

    Warning: these are not exhaustive.
    """
    print("Running basic tests...")
    test1 = softmax(np.array([1, 2]))
    print(test1)
    ans1 = np.array([0.26894142, 0.73105858])
    assert np.allclose(test1, ans1, rtol=1e-05, atol=1e-06)

    # Large inputs: only passes when the max is subtracted before np.exp.
    test2 = softmax(np.array([[1001, 1002], [3, 4]]))
    print(test2)
    ans2 = np.array([
        [0.26894142, 0.73105858],
        [0.26894142, 0.73105858]])
    assert np.allclose(test2, ans2, rtol=1e-05, atol=1e-06)

    # A 1 x D matrix; ans3 is 1-D and is broadcast by np.allclose.
    test3 = softmax(np.array([[-1001, -1002]]))
    print(test3)
    ans3 = np.array([0.73105858, 0.26894142])
    assert np.allclose(test3, ans3, rtol=1e-05, atol=1e-06)

    print("You should be able to verify these results by hand!\n")

if __name__ == "__main__":
    # Run the basic softmax checks when this file is executed as a script.
    test_softmax_basic()








(b)推導梯度下降(採用交叉熵的softmax函式), 此時class label可以視為0-1的one-hot編碼形式,也就是隻有一個1,其餘均為0。

(c)推導梯度下降,輸入x,只有一層隱藏層的神經網路,損失函式利用交叉熵來度量,神經網路中隱藏層的啟用函式採用sigmoid函式,輸出層採用softmax函式,標籤採用one-hot的形式。(其實就是神經網路的常規推導)








(b) 輸出層的求導情況




(e) 編寫sigmoid函式及其求導函式

#!/usr/bin/env python

import numpy as np

def sigmoid(x):
    """Compute the sigmoid function for the input here.

    Arguments:
    x -- A scalar or numpy array.

    Return:
    s -- sigmoid(x), computed elementwise as 1 / (1 + exp(-x)).
    """
    s = 1 / (1 + np.exp(-x))
    return s

def sigmoid_grad(s):
    """Compute the gradient for the sigmoid function here.

    Note that for this implementation, the input s should be the sigmoid
    function value of your original input x, not x itself.

    Arguments:
    s -- A scalar or numpy array.

    Return:
    ds -- Your computed gradient, s * (1 - s), elementwise.
    """
    ds = s * (1 - s)

    return ds

def test_sigmoid_basic():
    """Run a few simple checks of sigmoid and sigmoid_grad.

    Warning: these are not exhaustive.
    """
    print("Running basic tests...")
    x = np.array([[1, 2], [-1, -2]])
    f = sigmoid(x)
    g = sigmoid_grad(f)  # the gradient takes sigmoid(x), not x
    print(f)
    f_ans = np.array([
        [0.73105858, 0.88079708],
        [0.26894142, 0.11920292]])
    assert np.allclose(f, f_ans, rtol=1e-05, atol=1e-06)
    print(g)
    g_ans = np.array([
        [0.19661193, 0.10499359],
        [0.19661193, 0.10499359]])
    assert np.allclose(g, g_ans, rtol=1e-05, atol=1e-06)
    print("You should verify these results by hand!\n")

if __name__ == "__main__":
    # Run the basic sigmoid checks when this file is executed as a script.
    test_sigmoid_basic()


(f) 梯度檢查,利用雙邊檢查,得到的精確度更高。

#!/usr/bin/env python

import numpy as np
import random

# First implement a gradient checker by filling in the following functions
def gradcheck_naive(f, x):
    """Gradient check for a function f.

    Compares the gradient returned by f with a centered finite difference
    at every coordinate of x.

    Arguments:
    f -- a function that takes a single argument and outputs the
         cost and its gradients
    x -- the point (numpy array) to check the gradient at; each entry is
         perturbed in place by +/- h and then restored

    Prints "Gradient check passed!" when every coordinate agrees. On the
    first mismatch it prints the offending index and both gradient values
    and stops. Returns None either way.
    """
    rndstate = random.getstate()
    _, grad = f(x)   # Analytic gradient at the original point
    h = 1e-4         # Do not change this!

    # Iterate over all indexes ix in x to check the gradient.
    it = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])
    while not it.finished:
        ix = it.multi_index

        # Centered difference: it has smaller asymptotic error than the
        # forward / backward difference methods.
        # The random state is reset before every call to f so that cost
        # functions with built-in randomness see the same samples as the
        # call that produced `grad`.
        x[ix] += h
        random.setstate(rndstate)
        f_plus = f(x)[0]
        x[ix] -= 2 * h
        random.setstate(rndstate)
        f_minus = f(x)[0]
        x[ix] += h  # restore the original value
        numgrad = (f_plus - f_minus) / (2 * h)

        # Compare gradients
        reldiff = abs(numgrad - grad[ix]) / max(1, abs(numgrad), abs(grad[ix]))
        if reldiff > 1e-5:
            print("Gradient check failed.")
            print("First gradient error found at index %s" % str(ix))
            print("Your gradient: %f \t Numerical gradient: %f" % (
                grad[ix], numgrad))
            # Stop here so that "passed" is not printed after a failure.
            return

        it.iternext()  # Step to next dimension

    print("Gradient check passed!")

def sanity_check():
    """Some basic sanity checks.

    Runs gradcheck_naive on f(x) = sum(x^2), whose gradient is 2x, for a
    scalar, a 1-D array and a 2-D array.
    """
    quad = lambda x: (np.sum(x ** 2), x * 2)

    print("Running sanity checks...")
    gradcheck_naive(quad, np.array(123.456))      # scalar test
    gradcheck_naive(quad, np.random.randn(3,))    # 1-D test
    gradcheck_naive(quad, np.random.randn(4, 5))  # 2-D test
    print("")

if __name__ == "__main__":
    # Run the gradient-checker sanity checks when executed as a script.
    sanity_check()

程式解釋:函式 gradcheck_naive(f, x) 中,f 是一個只接受一個引數的函式,返回一個包含兩項的元組:第一項為損失函式 cost 的數值,第二項為梯度;x 為進行檢測的輸入點,可以是標量,也可以是向量或矩陣。函式先用 random.getstate() 記下隨機數生成器的狀態,目的是讓帶有隨機性的損失函式在每次呼叫時都能得到相同、可重複的結果。np.nditer 是一個支援多重索引的迭代器:對每個索引處的元素做雙邊(中心)差分 (f(x+h) − f(x−h)) / (2h),與解析梯度比較之後,再移到下一個元素繼續檢查。



(g) 最後一個是實現二層的神經網路(其中一層為隱藏層,一層為輸出層)

#!/usr/bin/env python

import numpy as np
import random

from q1_softmax import softmax
from q2_sigmoid import sigmoid, sigmoid_grad
from q2_gradcheck import gradcheck_naive

def forward_backward_prop(X, labels, params, dimensions):
    """Forward and backward propagation for a two-layer sigmoidal network

    Compute the forward propagation and for the cross entropy cost,
    the backward propagation for the gradients for all parameters.

    Notice the gradients computed here are different from the gradients in
    the assignment sheet: they are w.r.t. weights, not inputs.

    Arguments:
    X -- M x Dx matrix, where each row is a training example x.
    labels -- M x Dy matrix, where each row is a one-hot vector.
    params -- Model parameters, these are unpacked for you.
    dimensions -- A tuple of input dimension, number of hidden units
                  and output dimension

    Return:
    cost -- cross entropy summed (not averaged) over the M examples
    grad -- column vector of gradients in the same order as params:
            W1, b1, W2, b2
    """
    ### Unpack network parameters (do not modify)
    ofs = 0
    Dx, H, Dy = (dimensions[0], dimensions[1], dimensions[2])

    W1 = np.reshape(params[ofs:ofs + Dx * H], (Dx, H))
    ofs += Dx * H
    b1 = np.reshape(params[ofs:ofs + H], (1, H))
    ofs += H
    W2 = np.reshape(params[ofs:ofs + H * Dy], (H, Dy))
    ofs += H * Dy
    b2 = np.reshape(params[ofs:ofs + Dy], (1, Dy))

    # Forward pass.
    # Note: compute cost based on `sum` not `mean`.
    z1 = X.dot(W1) + b1
    a1 = sigmoid(z1)
    z2 = a1.dot(W2) + b2
    a2 = softmax(z2)
    cost = -np.sum(labels * np.log(a2))

    # Backward pass. For softmax followed by cross entropy the gradient
    # w.r.t. the output pre-activation is (prediction - label).
    gradz2 = (a2 - labels)
    gradW2 = a1.T.dot(gradz2)
    gradb2 = np.sum(gradz2, axis=0, keepdims=True)
    grada1 = gradz2.dot(W2.T)
    gradz1 = grada1 * sigmoid_grad(a1)  # sigmoid_grad takes the activation
    gradW1 = X.T.dot(gradz1)
    gradb1 = np.sum(gradz1, axis=0, keepdims=True)

    ### Stack gradients (do not modify)
    grad = np.concatenate((gradW1.flatten(), gradb1.flatten(), gradW2.flatten(), gradb2.flatten()))
    grad.resize((len(grad), 1))

    return cost, grad

def sanity_check():
    """Set up fake data and parameters for the neural network, and test
    forward_backward_prop with gradcheck_naive.
    """
    print("Running sanity check...")

    N = 20
    dimensions = [10, 5, 10]
    data = np.random.randn(N, dimensions[0])   # each row will be a datum
    labels = np.zeros((N, dimensions[2]))
    for i in range(N):
        # one-hot label with a random class
        labels[i, random.randint(0, dimensions[2] - 1)] = 1

    # One flat column vector holding W1, b1, W2 and b2.
    params = np.random.randn((dimensions[0] + 1) * dimensions[1] + (
        dimensions[1] + 1) * dimensions[2], 1)

    gradcheck_naive(lambda params: forward_backward_prop(data, labels, params, dimensions), params)

if __name__ == "__main__":
    # Gradient-check the network when this file is executed as a script.
    sanity_check()







(a) 中心詞的索引為c,預測索引為o的詞是否為中心詞的視窗範圍的詞,其中u(w)為字典中的所有的詞的詞向量,其實就是用二套詞向量來進行表示,方便解耦合,簡化學習過程。說了這麼多,這個題目就是求一個梯度。


(c)在(a)與(b)中,採用的傳統的,也就是初步的word2vec來實現的,但我們知道採用負取樣的方法,實現效率更高。所以,這個題目就是用來驗證這個結論。用CE loss的執行時間除以negative sampling loss的執行時間來作為speed-up ratio。

(d)word2vec中有二種類別,一種為CBOW, 另一種為skip-gram。視窗大小為m, 然後二種方式的梯度的推導。這是一個不斷擴充套件的問題,一步步的從抽象的情況,擴充套件到具體的情況。







(c)word2vec的負取樣實現中,一次迭代中只需要計算的是K+1個數據, 而對於傳統的softmax方式中,則需要計算的是W+1個數據,所以,時間花費大約為(W+1)/(K+1)

(d)對於skip-gram而言, 推導如下:






對於softmaxCostAndGradient函式,就是通過(a)(b)的公式來實現。但一定注意的是矩陣的維度,尤其是對於(3L, )這個問題,很容易出現各種莫名其妙的問題,所以最好就把矩陣的維度統一,這樣比較容易求。






#!/usr/bin/env python
# -*- coding:utf-8 -*-

import numpy as np
import random

from q1_softmax import softmax
from q2_gradcheck import gradcheck_naive
from q2_sigmoid import sigmoid, sigmoid_grad

def normalizeRows(x):
    """Row normalization function

    Normalizes each row of a matrix to have unit length by dividing the
    row by its Euclidean (L2) norm.

    Arguments:
    x -- 2-D numpy array; rows are the vectors to normalize.

    Return:
    A new array of the same shape whose rows have unit length.
    """
    x = x / (np.sqrt(np.sum(x * x, axis=1, keepdims=True)))
    return x

def test_normalize_rows():
    """Check normalizeRows against a small hand-computed example."""
    print("Testing normalizeRows...")
    x = normalizeRows(np.array([[3.0, 4.0], [1, 2]]))
    print(x)
    ans = np.array([[0.6, 0.8], [0.4472136, 0.89442719]])
    assert np.allclose(x, ans, rtol=1e-05, atol=1e-06)
    print("")

def softmaxCostAndGradient(predicted, target, outputVectors, dataset):
    """ Softmax cost function for word2vec models

    Implement the cost and gradients for one predicted word vector
    and one target word vector as a building block for word2vec
    models, assuming the softmax prediction function and cross
    entropy loss.

    Arguments:
    predicted -- numpy ndarray, predicted word vector (v_hat in
                 the written component)
    target -- integer, the index of the target word
    outputVectors -- "output" vectors (as rows) for all tokens
    dataset -- needed for negative sampling, unused here.

    Return:
    cost -- cross entropy cost for the softmax word prediction
            (a one-element array, because the probabilities are kept
            as a column)
    gradPred -- the gradient with respect to the predicted word
                vector, as a 1 x D row
    grad -- the gradient with respect to all the other word
            vectors, same shape as outputVectors
    """
    # Reshape to explicit 2-D shapes up front to avoid shape surprises:
    # softmax normalizes along rows, so the scores must form one row.
    predicted = predicted.reshape([1, predicted.shape[0]])
    y_hat = softmax(predicted.dot(outputVectors.T)).reshape([outputVectors.shape[0], 1])
    y_real = np.zeros_like(y_hat)
    y_real[target] = 1  # one-hot column for the target word
    cost = -np.log(y_hat[target])
    delta = y_hat - y_real
    gradPred = delta.T.dot(outputVectors)
    grad = delta.dot(predicted)
    return cost, gradPred, grad

def getNegativeSamples(target, dataset, K):
    """ Samples K indexes which are not the target

    Arguments:
    target -- index that must not appear in the result
    dataset -- object providing sampleTokenIdx(), called repeatedly
               until a value different from target comes back
    K -- number of indexes to draw

    Return:
    A list of K sampled indexes (repeats among them are possible).
    """
    indices = []
    for _ in range(K):
        newidx = dataset.sampleTokenIdx()
        while newidx == target:
            # Rejection sampling: redraw whenever the target comes up.
            newidx = dataset.sampleTokenIdx()
        indices.append(newidx)
    return indices

def negSamplingCostAndGradient(predicted, target, outputVectors, dataset,
                               K=10):
    """ Negative sampling cost function for word2vec models

    Implement the cost and gradients for one predicted word vector
    and one target word vector as a building block for word2vec
    models, using the negative sampling technique. K is the sample
    size.

    Note: See test_word2vec below for dataset's initialization.

    Arguments/Return Specifications: same as softmaxCostAndGradient

    NOTE(review): the tail of the signature was missing from this
    listing; K must have a default because skipgram calls this function
    with four arguments. The value 10 is assumed -- confirm.
    """
    # Sampling of indices is done for you. Do not modify this if you
    # wish to match the autograder and receive points!
    indices = [target]
    indices.extend(getNegativeSamples(target, dataset, K))

    predicted = predicted.reshape([predicted.shape[0], 1])  # [D, 1]
    gradPred = np.zeros(predicted.shape)
    cost = 0

    # Positive term: -log(sigmoid(u_o . v))
    soft_vc = sigmoid(outputVectors[target, :].dot(predicted))  # [1, D]*[D, 1]
    cost -= np.log(soft_vc)
    gradPred += (soft_vc - 1.0) * outputVectors[target, :].reshape(predicted.shape)  # [D, 1]
    # Per-row coefficients; multiplied by predicted.T at the end.
    grad_temp = np.zeros([outputVectors.shape[0], 1])    # [M, 1]
    grad_temp[target] = soft_vc - 1.0

    # Negative terms: -log(sigmoid(-u_k . v)) for each sampled index.
    # A sample drawn more than once contributes once per draw.
    for idx in indices[1:]:
        soft_vk = sigmoid(-outputVectors[idx, :].dot(predicted))
        cost -= np.log(soft_vk)
        gradPred -= (soft_vk - 1.0) * outputVectors[idx, :].reshape(predicted.shape)
        grad_temp[idx] -= (soft_vk - 1.0)
    grad = grad_temp.dot(predicted.T)   # [M, 1]*[1, D]=[M, D]

    return cost, gradPred, grad

def skipgram(currentWord, C, contextWords, tokens, inputVectors, outputVectors,
             dataset, word2vecCostAndGradient=softmaxCostAndGradient):
    """ Skip-gram model in word2vec

    The center word's input vector is used to predict every context word;
    the costs and gradients of the individual predictions are summed.

    Arguments:
    currentWord -- a string of the current center word
    C -- integer, context size
    contextWords -- list of no more than 2*C strings, the context words
    tokens -- a dictionary that maps words to their indices in
              the word vector list
    inputVectors -- "input" word vectors (as rows) for all tokens
    outputVectors -- "output" word vectors (as rows) for all tokens
    word2vecCostAndGradient -- the cost and gradient function for
                               a prediction vector given the target
                               word vectors, could be one of the two
                               cost functions you implemented above.

    Return:
    cost -- the cost function value for the skip-gram model
    gradIn -- the gradient with respect to the input word vectors; only
              the center word's row is non-zero
    gradOut -- the gradient with respect to the output word vectors
    """
    cost = 0.0
    gradIn = np.zeros(inputVectors.shape)
    gradOut = np.zeros(outputVectors.shape)

    # The center word is the same for every context word: look it up once.
    center = tokens[currentWord]
    predicted = inputVectors[center]

    for word in contextWords:
        word_cost, gradPred, grad = word2vecCostAndGradient(
            predicted, tokens[word], outputVectors, dataset)
        cost += word_cost
        # gradPred may come back as a row or a column; flatten it.
        gradIn[center, :] += np.squeeze(gradPred)
        gradOut += grad

    return cost, gradIn, gradOut

def cbow(currentWord, C, contextWords, tokens, inputVectors, outputVectors,
         dataset, word2vecCostAndGradient=softmaxCostAndGradient):
    """CBOW model in word2vec

    Implement the continuous bag-of-words model in this function.

    Arguments/Return specifications: same as the skip-gram model

    Extra credit: Implementing CBOW is optional, but the gradient
    derivations are not. If you decide not to implement CBOW, remove
    the NotImplementedError.

    As written this function always raises NotImplementedError.
    """
    cost = 0.0
    gradIn = np.zeros(inputVectors.shape)
    gradOut = np.zeros(outputVectors.shape)

    # Placeholder: CBOW is not implemented here. The return statement
    # below is only reached once this raise is removed.
    raise NotImplementedError

    return cost, gradIn, gradOut

# Testing functions below. DO NOT MODIFY!   #

def word2vec_sgd_wrapper(word2vecModel, tokens, wordVectors, dataset, C,
                         word2vecCostAndGradient=softmaxCostAndGradient):
    """Average cost and gradient of a word2vec model over a random batch.

    Arguments:
    word2vecModel -- model function with the skipgram signature
    tokens -- a dictionary that maps words to their indices
    wordVectors -- input vectors stacked on top of output vectors; the
                   first half of the rows is passed as inputVectors, the
                   second half as outputVectors
    dataset -- object providing getRandomContext(C)
    C -- maximum context size; each sample draws a size in [1, C]
    word2vecCostAndGradient -- cost function handed to the model

    Return:
    cost -- cost averaged over the batch
    grad -- gradient with the same shape as wordVectors

    NOTE(review): the tail of the signature was missing from this
    listing; the default for word2vecCostAndGradient is assumed to match
    skipgram's -- confirm.
    """
    batchsize = 50
    cost = 0.0
    grad = np.zeros(wordVectors.shape)
    N = wordVectors.shape[0]
    half = N // 2  # floor division so the slice bound stays an integer
    inputVectors = wordVectors[:half, :]
    outputVectors = wordVectors[half:, :]
    for _ in range(batchsize):
        C1 = random.randint(1, C)
        centerword, context = dataset.getRandomContext(C1)

        # The listing assigned 1 for skipgram and for every other model,
        # so the normalizer is 1 regardless of word2vecModel.
        denom = 1

        c, gin, gout = word2vecModel(
            centerword, C1, context, tokens, inputVectors, outputVectors,
            dataset, word2vecCostAndGradient)
        cost += c / batchsize / denom
        grad[:half, :] += gin / batchsize / denom
        grad[half:, :] += gout / batchsize / denom

    return cost, grad

def test_word2vec():
    """ Interface to the dataset for negative sampling """
    # Minimal stand-in dataset: five tokens, uniformly sampled.
    dataset = type('dummy', (), {})()

    def dummySampleTokenIdx():
        return random.randint(0, 4)

    def getRandomContext(C):
        tokens = ["a", "b", "c", "d", "e"]
        return tokens[random.randint(0, 4)], \
            [tokens[random.randint(0, 4)] for i in range(2 * C)]

    dataset.sampleTokenIdx = dummySampleTokenIdx
    dataset.getRandomContext = getRandomContext

    # 10 rows: 5 input vectors followed by 5 output vectors.
    dummy_vectors = normalizeRows(np.random.randn(10, 3))
    dummy_tokens = dict([("a", 0), ("b", 1), ("c", 2), ("d", 3), ("e", 4)])
    print("==== Gradient check for skip-gram ====")
    gradcheck_naive(lambda vec: word2vec_sgd_wrapper(
        skipgram, dummy_tokens, vec, dataset, 5, softmaxCostAndGradient),
        dummy_vectors)
    gradcheck_naive(lambda vec: word2vec_sgd_wrapper(
        skipgram, dummy_tokens, vec, dataset, 5, negSamplingCostAndGradient),
        dummy_vectors)
    # print "\n==== Gradient check for CBOW      ===="
    # gradcheck_naive(lambda vec: word2vec_sgd_wrapper(
    #     cbow, dummy_tokens, vec, dataset, 5, softmaxCostAndGradient),
    #     dummy_vectors)
    # gradcheck_naive(lambda vec: word2vec_sgd_wrapper(
    #     cbow, dummy_tokens, vec, dataset, 5, negSamplingCostAndGradient),
    #     dummy_vectors)

    print("\n=== Results ===")
    print(skipgram("c", 3, ["a", "b", "e", "d", "b", "c"],
        dummy_tokens, dummy_vectors[:5, :], dummy_vectors[5:, :], dataset))
    print(skipgram("c", 1, ["a", "b"],
        dummy_tokens, dummy_vectors[:5, :], dummy_vectors[5:, :], dataset,
        negSamplingCostAndGradient))
    # print cbow("a", 2, ["a", "b", "c", "a"],
    #     dummy_tokens, dummy_vectors[:5,:], dummy_vectors[5:,:], dataset)
    # print cbow("a", 2, ["a", "b", "a", "c"],
    #     dummy_tokens, dummy_vectors[:5,:], dummy_vectors[5:,:], dataset,
    #     negSamplingCostAndGradient)

if __name__ == "__main__":
    # Run the word2vec self-tests when this file is executed as a script.
    test_normalize_rows()
    test_word2vec()


拿到一個這樣需要補全的程式,第一步要先明確自己需要補全的是哪一部分;第二步,從 if __name__ == '__main__' 看起,因為這是程式的入口,然後順著程式的執行流程去理解其餘的程式碼。





#!/usr/bin/env python

# Save parameters every a few SGD iterations as fail-safe
# NOTE(review): sgd() reads SAVE_PARAMS_EVERY but its definition was
# missing from this listing; the value 5000 is assumed -- confirm.
SAVE_PARAMS_EVERY = 5000

import glob
import random
import numpy as np
import os.path as op
import cPickle as pickle

def load_saved_params():
    """Load the most recent saved parameters and the iteration to resume at.

    Scans the current directory for files named
    saved_params_<iteration>.npy, picks the highest iteration number and
    unpickles two objects from that file: the parameters and the saved
    state of the `random` module.

    Return:
    (iteration, params, state), or (0, None, None) when no checkpoint
    with a positive iteration number is found.
    """
    st = 0
    for path in glob.glob("saved_params_*.npy"):
        iteration = int(op.splitext(op.basename(path))[0].split("_")[2])
        if iteration > st:
            st = iteration

    if st > 0:
        # Pickle data is binary, so the file is opened in "rb" mode.
        # NOTE: unpickling can execute arbitrary code; only load
        # checkpoint files you created yourself.
        with open("saved_params_%d.npy" % st, "rb") as f:
            params = pickle.load(f)
            state = pickle.load(f)
        return st, params, state
    else:
        return st, None, None

def save_params(iter, params):
    """Write a checkpoint for SGD iteration `iter`.

    The file saved_params_<iter>.npy in the current directory receives
    two pickled objects in this order: `params`, then the current state
    of the `random` module.
    """
    # Pickle data is binary, so the file is opened in "wb" mode.
    with open("saved_params_%d.npy" % iter, "wb") as f:
        pickle.dump(params, f)
        pickle.dump(random.getstate(), f)

def sgd(f, x0, step, iterations, postprocessing=None, useSaved=False,
        PRINT_EVERY=10):
    """ Stochastic Gradient Descent

    Arguments:
    f -- the function to optimize, it should take a single
         argument and yield two outputs, a cost and the gradient
         with respect to the arguments
    x0 -- the initial point to start SGD from
    step -- the step size for SGD
    iterations -- total iterations to run SGD for
    postprocessing -- postprocessing function for the parameters
                      if necessary. In the case of word2vec we will need to
                      normalize the word vectors to have unit length.
    useSaved -- when true, resume from the newest checkpoint found by
                load_saved_params() and write a checkpoint every
                SAVE_PARAMS_EVERY iterations
    PRINT_EVERY -- specifies how many iterations to output loss

    Return:
    x -- the parameter value after SGD finishes

    NOTE(review): the tail of the signature was missing from this
    listing; callers pass PRINT_EVERY by keyword and the default of 10
    is assumed -- confirm.
    """
    # Anneal learning rate every several iterations
    ANNEAL_EVERY = 20000

    if useSaved:
        start_iter, oldx, state = load_saved_params()
        if start_iter > 0:
            x0 = oldx
            # Re-apply the halvings that happened before the checkpoint;
            # floor division counts only completed annealing periods.
            step *= 0.5 ** (start_iter // ANNEAL_EVERY)

        if state:
            # Continue the random stream from where the checkpoint left off.
            random.setstate(state)
    else:
        start_iter = 0

    x = x0

    if not postprocessing:
        postprocessing = lambda x: x

    expcost = None  # exponentially smoothed cost, for display only

    for iteration in range(start_iter + 1, iterations + 1):
        cost, grad = f(x)
        x -= step * grad
        # Apply the postprocessing after every iteration, as required by
        # the contract of the postprocessing argument.
        x = postprocessing(x)

        if iteration % PRINT_EVERY == 0:
            if not expcost:
                expcost = cost
            else:
                expcost = .95 * expcost + .05 * cost
            print("iter %d: %f" % (iteration, expcost))

        # useSaved is tested first so SAVE_PARAMS_EVERY is only looked up
        # when checkpointing is actually enabled.
        if useSaved and iteration % SAVE_PARAMS_EVERY == 0:
            save_params(iteration, x)

        if iteration % ANNEAL_EVERY == 0:
            step *= 0.5

    return x

def sanity_check():
    """Check that sgd drives f(x) = x^2 to zero from three start points."""
    quad = lambda x: (np.sum(x ** 2), x * 2)

    print("Running sanity checks...")
    t1 = sgd(quad, 0.5, 0.01, 1000, PRINT_EVERY=100)
    print("test 1 result: %s" % t1)
    assert abs(t1) <= 1e-6

    t2 = sgd(quad, 0.0, 0.01, 1000, PRINT_EVERY=100)
    print("test 2 result: %s" % t2)
    assert abs(t2) <= 1e-6

    t3 = sgd(quad, -1.5, 0.01, 1000, PRINT_EVERY=100)
    print("test 3 result: %s" % t3)
    assert abs(t3) <= 1e-6

    print("")

if __name__ == "__main__":
    # Run the SGD sanity checks when this file is executed as a script.
    sanity_check()

(g) 訓練一個語料庫,程式碼如下。




#!/usr/bin/env python

import random
import numpy as np
from utils.treebank import StanfordSentiment
import matplotlib
import matplotlib.pyplot as plt
import time

from q3_word2vec import *
from q3_sgd import *

# Reset the random seed to make sure that everyone gets the same results
# NOTE(review): the seeding call was missing from this listing; the value
# below is assumed -- confirm.
random.seed(314)
dataset = StanfordSentiment()
tokens = dataset.tokens()
nWords = len(tokens)

# We are going to train 10-dimensional vectors for this assignment
dimVectors = 10

# Context size
C = 5

# Reset the random seed to make sure that everyone gets the same results
# NOTE(review): these seeding calls were missing from this listing; the
# values are assumed -- confirm.
random.seed(31415)
np.random.seed(9265)

# Start of the timed section; read by the "training took" message below.
startTime = time.time()

# Rows [0, nWords) are the input vectors (small random values), rows
# [nWords, 2*nWords) are the output vectors (zeros).
wordVectors = np.concatenate(
    ((np.random.rand(nWords, dimVectors) - 0.5) /
       dimVectors, np.zeros((nWords, dimVectors))),
    axis=0)
# NOTE(review): the cost-function argument was cut off in this listing;
# negSamplingCostAndGradient is assumed -- confirm.
wordVectors = sgd(
    lambda vec: word2vec_sgd_wrapper(skipgram, tokens, vec, dataset, C,
        negSamplingCostAndGradient),
    wordVectors, 0.3, 40000, None, True, PRINT_EVERY=10)
# Note that normalization is not called here. This is not a bug,
# normalizing during training loses the notion of length.

print("sanity check: cost at convergence should be around or below 10")
print("training took %d seconds" % (time.time() - startTime))

# concatenate the input and output word vectors
# NOTE(review): the axis argument was cut off in this listing; axis=0 is
# assumed because the rows are indexed by token id just below -- confirm.
wordVectors = np.concatenate(
    (wordVectors[:nWords,:], wordVectors[nWords:,:]),
    axis=0)
# wordVectors = wordVectors[:nWords,:] + wordVectors[nWords:,:]

# NOTE(review): the end of this list was cut off in the listing; it is
# closed after the last visible entry -- confirm.
visualizeWords = [
    "the", "a", "an", ",", ".", "?", "!", "``", "''", "--",
    "good", "great", "cool", "brilliant", "wonderful", "well", "amazing",
    "worth", "sweet", "enjoyable", "boring", "bad", "waste", "dumb"]

# Project the selected word vectors onto their first two principal
# directions (SVD of the covariance of the centered vectors).
visualizeIdx = [tokens[word] for word in visualizeWords]
visualizeVecs = wordVectors[visualizeIdx, :]
temp = (visualizeVecs - np.mean(visualizeVecs, axis=0))
covariance = 1.0 / len(visualizeIdx) * temp.T.dot(temp)
U,S,V = np.linalg.svd(covariance)
coord = temp.dot(U[:,0:2])

for i in range(len(visualizeWords)):
    plt.text(coord[i,0], coord[i,1], visualizeWords[i],
        bbox=dict(facecolor='green', alpha=0.1))

plt.xlim((np.min(coord[:,0]), np.max(coord[:,0])))
plt.ylim((np.min(coord[:,1]), np.max(coord[:,1])))

# NOTE(review): nothing in the listing shows or saves the figure; the
# output file name is assumed -- confirm.
plt.savefig('q3_word_vectors.png')





很負面(0),負面(1),中立(2), 積極(3),很積極(4)


(b)解釋為什麼需要在分類時進行正則化。(normalization, regularization)














Best regularization value: 7.74E-02
Test accuracy (%): 30.045249


True	Predicted	Text
3	4	it 's a lovely film with lovely performances by buy and accorsi .
2	1	no one goes unindicted here , which is probably for the best .
3	1	and if you 're not nearly moved to tears by a couple of scenes , you 've got ice water in your veins .






















1)對於混淆矩陣的畫法,不是僅僅得到一個矩陣就OK, 也是可以畫出影象的,很美觀。





#!/usr/bin/env python
# -*- coding:utf-8 -*-

import argparse
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import itertools

from utils.treebank import StanfordSentiment
import utils.glove as glove

from q3_sgd import load_saved_params, sgd

# We will use sklearn here because it will run faster than implementing
# ourselves. However, for other parts of this assignment you must implement
# the functions yourself!
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

def getArguments():
    """Parse the command line.

    Exactly one of --pretrained and --yourvectors must be given; each is
    a boolean flag stored under the attribute of the same name.
    """
    arg_parser = argparse.ArgumentParser()
    source = arg_parser.add_mutually_exclusive_group(required=True)
    flags = (
        ("pretrained", "Use pretrained GloVe vectors."),
        ("yourvectors", "Use your vectors from q3."),
    )
    for flag, description in flags:
        source.add_argument("--" + flag, dest=flag, action="store_true",
                            help=description)
    return arg_parser.parse_args()

def getSentenceFeatures(tokens, wordVectors, sentence):
    """
    Obtain the sentence feature for sentiment analysis by averaging its
    word vectors

    Inputs:
    tokens -- a dictionary that maps words to their indices in
              the word vector list
    wordVectors -- word vectors (each row) for all tokens
    sentence -- a list of words in the sentence of interest

    Output:
    - sentVector: feature vector for the sentence, a 1-D array whose
      length is the number of columns of wordVectors
    """
    sentVector = np.zeros((wordVectors.shape[1],))

    # Sum the vectors of all words (repeated words count every time),
    # then divide by the sentence length to get the mean.
    for word in sentence:
        sentVector += wordVectors[tokens[word]]
    sentVector *= 1.0 / len(sentence)

    assert sentVector.shape == (wordVectors.shape[1],)
    return sentVector

def getRegularizationValues():
    """Try different regularizations

    Return a sorted list of values to try: 100 values spaced evenly on a
    log scale from 1e-2 to 1e2.
    """
    values = np.logspace(-2, 2, num=100, base=10)
    return sorted(values)

def chooseBestModel(results):
    """Choose the best model based on dev set performance.

    Arguments:
    results -- A list of python dictionaries of the following format:
        {
            "reg": regularization,
            "clf": classifier,
            "train": trainAccuracy,
            "dev": devAccuracy,
            "test": testAccuracy
        }

    Each dictionary represents the performance of one model.

    Returns:
    Your chosen result dictionary: the one with the highest "dev" value
    (the earliest one in the list if several tie).
    """
    # Select by the "dev" key only; train/test accuracy play no role.
    bestResult = max(results, key=lambda x: x['dev'])

    return bestResult

def accuracy(y, yhat):
    """Percentage of positions at which y and yhat hold the same value."""
    assert y.shape == yhat.shape
    matches = np.sum(y == yhat)
    return matches * 100.0 / y.size

def plotRegVsAccuracy(regValues, results, filename):
    """ Make a plot of regularization vs accuracy

    Arguments:
    regValues -- x values, one per entry of results
    results -- list of dictionaries with "train" and "dev" accuracies
    filename -- path the figure is written to
    """
    plt.plot(regValues, [x["train"] for x in results])
    plt.plot(regValues, [x["dev"] for x in results])
    plt.legend(['train', 'dev'], loc='upper left')
    # Write the figure out; previously `filename` was accepted but never
    # used, so the plot was never saved.
    plt.savefig(filename)

def outputConfusionMatrix(features, labels, clf, filename):
    """ Generate a confusion matrix

    Arguments:
    features -- feature matrix passed to clf.predict
    labels -- true labels, expected to take the values 0..4
    clf -- fitted classifier providing predict()
    filename -- path the figure is written to
    """
    pred = clf.predict(features)
    cm = confusion_matrix(labels, pred, labels=range(5))
    # Start a fresh figure so the matrix is not drawn on top of an
    # earlier plot.
    plt.figure()
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Reds)
    classes = ["- -", "-", "neut", "+", "+ +"]
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes)
    plt.yticks(tick_marks, classes)
    # Cell counts are drawn in white on dark cells and black on light ones.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 color="white" if cm[i, j] > thresh else "black")
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Write the figure out; previously `filename` was accepted but never
    # used.
    plt.savefig(filename)

def outputPredictions(dataset, features, labels, clf, filename):
    """ Write the predictions to file

    The file is tab separated with a header line; each following line
    holds the true label, the predicted label and the sentence text
    (words joined by single spaces).

    Arguments:
    dataset -- sequence whose items have the word list at index 0
    features -- feature matrix passed to clf.predict
    labels -- true labels, aligned with dataset
    clf -- fitted classifier providing predict()
    filename -- path of the file to write
    """
    pred = clf.predict(features)
    with open(filename, "w") as f:
        # f.write is used instead of the Python 2 only `print >> f` form.
        f.write("True\tPredicted\tText\n")
        for i, example in enumerate(dataset):
            f.write("%d\t%d\t%s\n" % (
                labels[i], pred[i], " ".join(example[0])))

def main(args):
    """ Train a model to do sentiment analyis"""

    # Load the dataset
    dataset = StanfordSentiment()
    tokens = dataset.tokens()
    nWords = len(tokens)

    if args.yourvectors:
        _, wordVectors, _ = load_saved_params()
        wordVectors = np.concatenate(
            (wordVectors[:nWords,:], wordVectors[nWords:,:]),
    elif args.pretrained:
        wordVectors = glove.loadWordVectors(tokens)
    dimVectors = wordVectors.shape[1]

    # Load the train set
    trainset = dataset.getTrainSentences()
    nTrain = len(trainset)
    trainFeatures = np.zeros((nTrain, dimVectors))
    trainLabels = np.zeros((nTrain,), dtype=np.int32)
    for i in xrange(nTrain):
        words, trainLabels[i] = trainset[i]
        trainFeatures[i, :] = getSentenceFeatures(tokens, wordVectors, words)

    # Prepare dev set features
    devset = dataset.getDevSentences()
    nDev = len(devset)
    devFeatures = np.zeros((nDev, dimVectors))
    devLabels = np.zeros((nDev,), dtype=np.int32)
    for i in xrange(nDev):
        words, devLabels[i] = devset[i]
        devFeatures[i, :] = getSentenceFeatures(tokens, wordVectors, words)

    # Prepare test set features
    testset = dataset.getTestSentences()
    nTest = len(testset)
    testFeatures = np.zeros((nTest, dimVectors))
    testLabels = np.zeros((nTest,), dtype=np.int32)
    for i in xrange(nTest):
        words, testLabels[i] = testset[i]
        testFeatures[i, :] = getSentenceFeatures(tokens, wordVectors, words)

    # We will save our results from each run
    results = []
    regValues = getRegularizationValues()
    for reg in regValues:
        print "Training for reg=%f" % reg
        # Note: add a very small number to regularization to please the library
        clf = LogisticRegression(C=1.0/(reg + 1e-12))
        clf.fit(trainFeatures, trainLabels)

        # Test on train set
        pred = clf.predict(trainFeatures)
        trainAccuracy = accuracy(trainLabels, pred)
        print "Train accuracy (%%): %f" % trainAccuracy

        # Test on dev set
        pred = clf.predict(devFeatures)
        devAccuracy = accuracy(devLabels, pred)
        print "Dev accuracy (%%): %f" % devAccuracy

        # Test on test set
        # Note: always running on test is poor style. Typically, you should
        # do this only after validation.
        pred = clf.predict(testFeatures)
        testAccuracy = accuracy(testLabels, pred)
        print "Test accuracy (%%): %f" % testAccuracy

            "reg": reg,
            "clf": clf,
            "train": trainAccuracy,