Coverage for /home/martinb/.local/share/virtualenvs/camcops/lib/python3.6/site-packages/scipy/linalg/_sketches.py : 43%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1""" Sketching-based Matrix Computations """
3# Author: Jordi Montes <jomsdev@gmail.com>
4# August 28, 2017
6import numpy as np
8from scipy._lib._util import check_random_state, rng_integers
9from scipy.sparse import csc_matrix
11__all__ = ['clarkson_woodruff_transform']
def cwt_matrix(n_rows, n_columns, seed=None):
    r"""
    Generate a matrix S which represents a Clarkson-Woodruff transform.

    Given the desired size of matrix, the method returns a matrix S of size
    (n_rows, n_columns) where each column has all the entries set to 0
    except for one position which has been randomly set to +1 or -1 with
    equal probability.

    Parameters
    ----------
    n_rows : int
        Number of rows of S
    n_columns : int
        Number of columns of S
    seed : None or int or `numpy.random.RandomState` instance, optional
        This parameter defines the ``RandomState`` object to use for drawing
        random variates.
        If None (or ``np.random``), the global ``np.random`` state is used.
        If integer, it is used to seed the local ``RandomState`` instance.
        Default is None.

    Returns
    -------
    S : (n_rows, n_columns) csc_matrix
        The returned matrix has ``n_columns`` nonzero entries.

    Notes
    -----
    Given a matrix A, with probability at least 9/10,

    .. math:: \|SA\| = (1 \pm \epsilon)\|A\|

    Where the error epsilon is related to the size of S.
    """
    rng = check_random_state(seed)
    # One nonzero per column: a uniformly random row index for each column.
    rows = rng_integers(rng, 0, n_rows, n_columns)
    # CSC indptr: column j's single entry sits at data index j, so the
    # pointer array is simply 0, 1, ..., n_columns.
    cols = np.arange(n_columns + 1)
    # Each entry is +1 or -1 with equal probability.
    signs = rng.choice([1, -1], n_columns)
    S = csc_matrix((signs, rows, cols), shape=(n_rows, n_columns))
    return S
def clarkson_woodruff_transform(input_matrix, sketch_size, seed=None):
    r"""
    Applies a Clarkson-Woodruff Transform/sketch to the input matrix.

    Given an input_matrix ``A`` of size ``(n, d)``, compute a matrix ``A'`` of
    size (sketch_size, d) so that

    .. math:: \|Ax\| \approx \|A'x\|

    with high probability via the Clarkson-Woodruff Transform, otherwise
    known as the CountSketch matrix.

    Parameters
    ----------
    input_matrix : array_like
        Input matrix, of shape ``(n, d)``.
    sketch_size : int
        Number of rows for the sketch.
    seed : None or int or `numpy.random.RandomState` instance, optional
        This parameter defines the ``RandomState`` object to use for drawing
        random variates.
        If None (or ``np.random``), the global ``np.random`` state is used.
        If integer, it is used to seed the local ``RandomState`` instance.
        Default is None.

    Returns
    -------
    A' : array_like
        Sketch of the input matrix ``A``, of size ``(sketch_size, d)``.

    Notes
    -----
    To make the statement

    .. math:: \|Ax\| \approx \|A'x\|

    precise, observe the following result which is adapted from the
    proof of Theorem 14 of [2]_ via Markov's Inequality. If we have
    a sketch size ``sketch_size=k`` which is at least

    .. math:: k \geq \frac{2}{\epsilon^2\delta}

    Then for any fixed vector ``x``,

    .. math:: \|Ax\| = (1\pm\epsilon)\|A'x\|

    with probability at least one minus delta.

    This implementation takes advantage of sparsity: computing
    a sketch takes time proportional to ``A.nnz``. Data ``A`` which
    is in ``scipy.sparse.csc_matrix`` format gives the quickest
    computation time for sparse input.

    >>> import numpy as np
    >>> from scipy import linalg
    >>> from scipy import sparse
    >>> n_rows, n_columns, density, sketch_n_rows = 15000, 100, 0.01, 200
    >>> A = sparse.rand(n_rows, n_columns, density=density, format='csc')
    >>> B = sparse.rand(n_rows, n_columns, density=density, format='csr')
    >>> C = sparse.rand(n_rows, n_columns, density=density, format='coo')
    >>> D = np.random.randn(n_rows, n_columns)
    >>> SA = linalg.clarkson_woodruff_transform(A, sketch_n_rows) # fastest
    >>> SB = linalg.clarkson_woodruff_transform(B, sketch_n_rows) # fast
    >>> SC = linalg.clarkson_woodruff_transform(C, sketch_n_rows) # slower
    >>> SD = linalg.clarkson_woodruff_transform(D, sketch_n_rows) # slowest

    That said, this method does perform well on dense inputs, just slower
    on a relative scale.

    Examples
    --------
    Given a big dense matrix ``A``:

    >>> import numpy as np
    >>> from scipy import linalg
    >>> n_rows, n_columns, sketch_n_rows = 15000, 100, 200
    >>> A = np.random.randn(n_rows, n_columns)
    >>> sketch = linalg.clarkson_woodruff_transform(A, sketch_n_rows)
    >>> sketch.shape
    (200, 100)
    >>> norm_A = np.linalg.norm(A)
    >>> norm_sketch = np.linalg.norm(sketch)

    Now with high probability, the true norm ``norm_A`` is close to
    the sketched norm ``norm_sketch`` in absolute value.

    Similarly, applying our sketch preserves the solution to a linear
    regression of :math:`\min \|Ax - b\|`.

    >>> import numpy as np
    >>> from scipy import linalg
    >>> n_rows, n_columns, sketch_n_rows = 15000, 100, 200
    >>> A = np.random.randn(n_rows, n_columns)
    >>> b = np.random.randn(n_rows)
    >>> x = np.linalg.lstsq(A, b, rcond=None)[0]
    >>> Ab = np.hstack((A, b.reshape(-1,1)))
    >>> SAb = linalg.clarkson_woodruff_transform(Ab, sketch_n_rows)
    >>> SA, Sb = SAb[:,:-1], SAb[:,-1]
    >>> x_sketched = np.linalg.lstsq(SA, Sb, rcond=None)[0]

    As with the matrix norm example, ``np.linalg.norm(A @ x - b)``
    is close to ``np.linalg.norm(A @ x_sketched - b)`` with high
    probability.

    References
    ----------
    .. [1] Kenneth L. Clarkson and David P. Woodruff. Low rank approximation
           and regression in input sparsity time. In STOC, 2013.

    .. [2] David P. Woodruff. Sketching as a tool for numerical linear
           algebra. In Foundations and Trends in Theoretical Computer
           Science, 2014.
    """
    # S is a sparse CountSketch matrix; the sparse dot product makes the
    # sketch cost proportional to the number of nonzeros of the input.
    S = cwt_matrix(sketch_size, input_matrix.shape[0], seed)
    return S.dot(input_matrix)