\[ \newcommand{\matrix}[1]{\mathbf{#1}} \] Let \(\matrix{A}\) be a data set of \(m\) points in \(\mathbb{R}^d\). One application of SVD is to create a compressed representation of \(\matrix{A}\). Rank-\(k\) approximation of \(A\) is created by calculating the singular value decomposition of \(\matrix{A}\) \[ \matrix{A} = \matrix{U}\matrix{\Sigma}{\matrix{V}} \] and reconstruct it with \(k \leq d\) first singular values. \[ \matrix{A_k} = \matrix{U_k}\matrix{\Sigma_k}\matrix{V_k^T} \]

SVD in R

Each implementation of SVD has some varieties in the output representation. For example, the right singular vectors \(\matrix{V}\) may or may not be be already transposed. In R, one can run svd(A) to obtain SVD, and get the output as below

##      [,1] [,2] [,3]
## [1,]    1    6   11
## [2,]    2    7   12
## [3,]    3    8   13
## [4,]    4    9   14
## [5,]    5   10   15
## $d
## [1] 3.512722e+01 2.465397e+00 1.269091e-15
## 
## $u
##            [,1]        [,2]        [,3]
## [1,] -0.3545571  0.68868664  0.57004417
## [2,] -0.3986964  0.37555453 -0.74547800
## [3,] -0.4428357  0.06242242 -0.17015793
## [4,] -0.4869750 -0.25070970  0.29657318
## [5,] -0.5311143 -0.56384181  0.04901858
## 
## $v
##            [,1]       [,2]       [,3]
## [1,] -0.2016649 -0.8903171  0.4082483
## [2,] -0.5168305 -0.2573316 -0.8164966
## [3,] -0.8319961  0.3756539  0.4082483

Although \(U\) should be a squared matrix, svd(A)$u is in fact an \(m\times d\) matrix. However, this truncation does not compromise the information in \(\matrix{A}\). Since all the rows \(d+1, d+2, \dots, m\) in \(\Sigma\) are 0, one can safely remove the columns \(d+1, d+2, \dots, m\) in \(\matrix{U}\).

Reconstruction

k.approx <- function(X,k) {
    X.svd = svd(X)
    D = diag(X.svd$d, nrow=k)
    return(X.svd$u[,1:k] %*%D %*% t(X.svd$v)[1:k,])
}
A = matrix(runif(1000),nrow=50)
residuals.rand = vary.k(A)
ggplot(residuals.rand,aes(x=k,y=residuals)) +
    geom_line(color='blue') +
    geom_point(color='blue') +
    xlab(expression(k)) +
    ylab(expression(group("|",A-A[k],"|")[F])) +
    theme_bw()

\(\matrix{A}\) is rank-1 matrix

\(\matrix{A}\) is rank-5 matrix

Every column in \(\matrix{A}\) is linearly dependent on the first five columns.

A = matrix(rep(0,1000),nrow=50)
A[,1:5] = matrix(runif(nrow(A)*5),ncol=5)
for (i in 6:ncol(A)) {
    A[,i] = A[,sample(5,size=1)]*runif(1) # + runif(nrow(A), min = -0.5,max=0.5) 
}

\(\matrix{A}\) is rank-5 matrix, with some noise

A = matrix(rep(0,1000),ncol=20)
A[,1:5] = matrix(runif(nrow(A)*5),ncol=5)
for (i in 6:ncol(A)) {
    A[,i] = A[,sample(5,size=1)]*runif(1) + runif(nrow(A), min=-1e-2, max=1e-2) # + runif(nrow(A), min = -0.5,max=0.5) 
}

Same rank matrices with different sizes

B = matrix(rep(0,400),ncol=20)
B[,1:5] = matrix(runif(nrow(B)*5),ncol=5)
for (i in 6:ncol(B)) {
    B[,i] = B[,sample(5,size=1)]*runif(1) + runif(nrow(B), min=-1e-2, max=1e-2) # + runif(nrow(A), min = -0.5,max=0.5) 
}

Characteristics of singular values

Magnitude

A.svd = svd(A)
B.svd = svd(B)
sv.df = data.frame('i' = 1:length(A.svd$d),'A' = A.svd$d, 'B'= B.svd$d)
ggplot(sv.df, aes(x=i)) +
    geom_line(aes(y=A), color='blue') +
    geom_point(aes(y=A), color='blue') +
    geom_line(aes(y=B), color='darkgreen') +
    geom_point(aes(y=B), color='darkgreen') +
    ylab(expression(S[i])) +
    theme_bw()

Relative contributions

\(B = \alpha A\)

mult.sv = data.frame('i' = 1:min(dim(A)))
for (const in 10^c(0,1,2,3,4)) {
    B = const * A
    ABdiff = (B/ norm(B,type='F')) - (A/norm(A,type='F'))
    diff.svd = svd(ABdiff)
    mult.sv[,as.character(const)] = diff.svd$d
}

mm = reshape2::melt(mult.sv,id.vars='i')
mm$variable = as.numeric(as.character(mm$variable))
ggplot(mm, aes(x=i,y=value,group=factor(variable),color=variable)) +
    geom_line() +
    ylab(expression(S[i])) +
    theme_bw()

\(B = const \times A + \epsilon\)

\[\epsilon \sim Normal(0,1)\]

dXY.p <- function(X,Y, lower.tail = TRUE) {
    d = svd(X-Y)$d
    d = d / sum(d)  # singular values are non-negative, no need to square
    E.d = 1/ length(d)
    # chisq = \sum(Obs_i - Exp_i)^2 / Exp_i
    chisq = sum( (d - E.d)^2 / E.d)
    return(pchisq(chisq, length(d) - 1,lower.tail = lower.tail))
}

mult.sv = data.frame('i' = 1:min(dim(A)))
factors = 10^c(0,1,2,3,4,5)
pVals = data.frame('MultiplicationFactor' = factors)
for (i in 1:length(factors)) {
    const = factors[i]
    B = const * A + rnorm(Reduce('*',dim(A)),mean = 0,sd = 0.1*const)
    ABdiff = (B/ norm(B,type='F')) - (A/norm(A,type='F'))
    diff.svd = svd(ABdiff)
    mult.sv[,as.character(const)] = diff.svd$d
    pVals[i,'p-value'] = dXY.p(A,B)
}
mm = reshape2::melt(mult.sv,id.vars='i')
names(mm) = c('rank', 'MultiplicationFactor', 'SingularValue')
mm[['MultiplicationFactor']] = as.numeric(as.character(mm$MultiplicationFactor))
ggplot(mm, aes(x=rank,y=SingularValue,group=factor(MultiplicationFactor),color=MultiplicationFactor)) +
    geom_line() +
    ylab(expression(S[i])) +
    theme_bw()

pVals
##   MultiplicationFactor      p-value
## 1                1e+00 7.664707e-18
## 2                1e+01 3.451641e-07
## 3                1e+02 7.642776e-07
## 4                1e+03 5.246422e-07
## 5                1e+04 7.753992e-07
## 6                1e+05 5.346436e-07