DataScienceII/StochasticSVD.py at master · NBKlepp/DataScienceII · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
'''
This is an implementation of stochastic SVD, comparing the results of this implementation with the "stock" sklearn implementation.
We have implemented power iterations as well as oversampling to improve performance.
The results of the stochastic SVD solver implemented here are compared with a deterministic SVD solver implemented by sci-kit learn,
as well as the "stock" stochastic SVD solver implemented in sklearn. The results are plotted to the screen.
'''
import numpy as np
import argparse
from sklearn import datasets
import numpy.linalg
from matplotlib import pyplot as plt
from collections import OrderedDict
from sklearn.decomposition import PCA

#Constants used later to generate different plots
DEBUG = False
OVERSAMPLE = True
N_OVERSAMPLE = False
POWER = True
N_POWER = False

'''
The stochastic SVD solver.
First compute the preconditioning matrix.
    Given a matrix of data, A, compute Y = A*Omega, where Omega ~ N(0,1)
    Perform QR decomposition on Y to computer Y=QR, where Q is our preconditioning matrix.
    If we are interested in using power itereations, they may then be performed.
Next, precondition the system by finding B = Q^t * A.
    The singular values of the preconditioned matrix are much easier to solve for, and a
    good approxiation of the singular values of our original system after we project back
    them back into the original space.
@param A  the data
@param k  the number of singular values we want to find
@param oversample  whether to use oversampling
@param power       whether to use power iterations
@param q           the number of power iterations to perform (if performing)
@return            a tuple (l,s,r) of left_singular_vectors, the singular values, and right singular vectors
'''
def SSVD(A, k, oversample,power,q):
    #the size of the data
    m = A.shape[1]
    #find Y=A*Omega
    OMEGA = np.random.normal(0,1,m*k).reshape(m,k) if oversample == False else np.random.normal(0,1,m*2*k).reshape(m,2*k)
    Y = np.dot(A,OMEGA)
    #find Q using QR decomposition on Y
    Q,R = np.linalg.qr(Y)
    #use power iterations if necessary
    if power : Q = powerIter(Q,q,A)
    #precondition the system.
    B = np.dot(Q.T,A)
    #extract the singular values.
    U_hat,s,V_hat_T = np.linalg.svd(B)
    V_hat = V_hat_T.T
    #project back into the original space (i.e. - pre-pre-conditioned...)
    lsv = np.dot(Q,U_hat)
    rsv = V_hat[:,0:k]
    ret = (lsv,s,rsv) if not oversample else (lsv,s[0:k],rsv)

    if DEBUG :
        print("s:",s)
        print("U_hat.shape:", U_hat.shape, "\ns.size:", s.size, "\nV_hat_t.shape:", V_hat_T.shape)
        print("ret:",ret)
    return ret
'''
A method to perform power iterations on a preconditioning matrix.
    Let Q_0 be the original preconditioning matrix.
    For i in 1 to q :
        Y = A * A^T * Q_i-1
        Q_i, R = QR_decomp(Y)

@param Q  the preconditioning matrix
@param q  the number of power iterations to perform
@param A  the original data matrix
'''
def powerIter(Q,q,A):
    for i in np.arange(q):
        Y = np.dot(A,np.dot(A.T,Q))
        Q = np.linalg.qr(Y)[0]
    return Q

if __name__ == "__main__":

    parser = argparse.ArgumentParser(description = "Assignment 5 Question 3",
        epilog = "CSCI 4360/6360 Data Science II: Fall 2017",
        add_help = "How to use",
        prog = "python assignment5_q3.py -k <basis> -p <power-iterations>")

    # args.

    parser.add_argument("-k", "--basis", default = 10, type = int,
        help = "The number of basis vectors.")
    parser.add_argument("-p", "--power", default = 3, type = int,
        help = "The number of basis vectors.")

    args = vars(parser.parse_args())

    k = args['basis']
    q = args['power']

    #run tests on the MNIST dataset
    A = datasets.load_digits().data
    #the deterministic SVD solution (for comparison's sake)
    U,s,V_t = np.linalg.svd(A)
    deterministicSingularValues = s[0:k]

    #space for results
    stochasticRes = np.array([]*k).reshape(0,k)
    overSampleRes = np.array([]*k).reshape(0,k)
    q_vals = np.arange(1,q+1)
    power1 = np.array([]*k).reshape(0,k)
    power2 = np.array([]*k).reshape(0,k)
    power3 = np.array([]*k).reshape(0,k)
    powers = [power1,power2,power3]

    #some parameters for plotting
    q_colors = ['c','m','y']

    #run the SSVD solver 10 times without oversampling and without power iterations, storing the results
    for i in np.arange(10):
        if DEBUG : print("stochasticRes,SSVD(A,k,N_OVERSAMPLE)[1]:", SSVD(A,k,N_OVERSAMPLE,N_POWER,0)[1])
        s = SSVD(A,k,N_OVERSAMPLE,N_POWER,0)[1]
        stochasticRes = np.concatenate([stochasticRes, np.array([s])])
    #run the SSVD solver 10 times with oversampling but without power iterations, storing the results
    for i in np.arange(10):
        s = SSVD(A,k,OVERSAMPLE,N_POWER,0)[1]
        if DEBUG : print("s.size from OVERSAMPLE results:",s.size)
        overSampleRes = np.concatenate([overSampleRes, np.array([s])])
    #run the SSVD solver 10 times with oversampling and power iterations, storing the results
    for q in q_vals:
        for i in np.arange(10):
            s = SSVD(A,k,OVERSAMPLE,POWER,q)[1]
            if DEBUG : print("s.size from OVERSAMPLE results:",s.size)
            powers[q-1] = np.concatenate([powers[q-1], np.array([s])])

    #finding the SSVD results using the stock sklearn package
    pca = PCA(n_components=k,svd_solver = 'randomized')
    pca.fit(A)
    pca2 = PCA(n_components=k,svd_solver = 'randomized',iterated_power = 3)
    pca2.fit(A)

    #for use as the x-axis in our plots
    x = np.arange(1,k+1)
    if DEBUG : print("x.size:",x.size)

    #the difference is so small, it can be hard to see!
    if DEBUG : print("Diff:",pca.singular_values_ - pca2.singular_values_)

    plots = []

    plots = plots + [plt.scatter(x, deterministicSingularValues,marker="x",label = "deterministic",c='r')]

    for s_val in stochasticRes :
        if DEBUG: print("s_val.size:",s_val.size)
        plots = plots + [plt.scatter(x,s_val,marker = "+",label = "no_oversample",c='b')]

    for s_val in overSampleRes :
        plots = plots + [plt.scatter(x,s_val,marker = "^",label = "oversampled",c='g')]

    for q in q_vals:
        for s_val in powers[q-1] :
            plots = plots + [plt.scatter(x,s_val,marker = "*",label = "power" + str(q) ,c=q_colors[q-1])]

    plots = plots + [plt.scatter(x,pca.singular_values_,marker = "v",label = "Halko0Power",c='k')]
    plots = plots + [plt.scatter(x,pca2.singular_values_,marker = "v",label = "Halko3Power",c='xkcd:sky blue')]
    handles, labels = plt.gca().get_legend_handles_labels()
    by_label = OrderedDict(zip(labels,handles))
    plt.legend(by_label.values(),by_label.keys())
    plt.title("Comparing the SVD Solvers")
    plt.show()