Singular Value Decomposition

Contents

Singular Value Decomposition#

Singular Value Decomposition is explored in detail here:

Machine Learning Exploration : Singular Value Decomposition

https://machinelearningexploration.readthedocs.io/en/latest/MathExploration/SingularValueDecomposition.html

[1]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

[2]:

# Read the user/movie ratings table from the local dataset repository
# and preview its first rows.
ratings_csv_path = '/opt/datasetsRepo/RecommendationData/ratings.csv'
ratings = pd.read_csv(ratings_csv_path)
ratings.head()

[2]:

	userId	movieId	rating	timestamp
0	1	1	4.0	964982703
1	1	3	4.0	964981247
2	1	6	4.0	964982224
3	1	47	5.0	964983815
4	1	50	5.0	964982931

[243]:

# Read the movie catalogue (movieId, title, genres) from the local dataset
# repository and preview its first five rows.
movies_csv_path = '/opt/datasetsRepo/RecommendationData/movies.csv'
movies = pd.read_csv(movies_csv_path)
movies.head(n=5)

[243]:

	movieId	title	genres
0	1	Toy Story (1995)	Adventure\|Animation\|Children\|Comedy\|Fantasy
1	2	Jumanji (1995)	Adventure\|Children\|Fantasy
2	3	Grumpier Old Men (1995)	Comedy\|Romance
3	4	Waiting to Exhale (1995)	Comedy\|Drama\|Romance
4	5	Father of the Bride Part II (1995)	Comedy

[3]:

# Give every user and every movie that appears in `ratings` a contiguous
# 0-based index (numbered in the order `unique()` returns the ids), and keep
# lookup tables in both directions.
idx_to_userid_mapper = dict(enumerate(ratings.userId.unique()))
userid_to_idx_mapper = {user_id: idx for idx, user_id in idx_to_userid_mapper.items()}

idx_to_movieid_mapper = dict(enumerate(ratings.movieId.unique()))
movieid_to_idx_mapper = {movie_id: idx for idx, movie_id in idx_to_movieid_mapper.items()}

# Both mappers were built from these very columns, so every id has an index
# and the mapped result can be cast in one vectorised step instead of calling
# np.int32 once per row.
ratings['user_idx'] = ratings['userId'].map(userid_to_idx_mapper).astype(np.int32)
ratings['movie_idx'] = ratings['movieId'].map(movieid_to_idx_mapper).astype(np.int32)
ratings.head(5)

[3]:

	userId	movieId	rating	timestamp	user_idx	movie_idx
0	1	1	4.0	964982703	0	0
1	1	3	4.0	964981247	0	1
2	1	6	4.0	964982224	0	2
3	1	47	5.0	964983815	0	3
4	1	50	5.0	964982931	0	4

[251]:

# Attach the matrix column index to each catalogue entry. Movies that never
# occur in `ratings` have no entry in the mapper and therefore get NaN.
# NOTE: a trailing `.dropna()` would have no effect here, because assigning
# the Series to a column re-aligns it on the DataFrame index and re-creates
# the NaN rows; unrated movies are filtered explicitly where needed instead.
movies['movie_idx'] = movies['movieId'].map(movieid_to_idx_mapper)
movies.head(5)

[251]:

	movieId	title	genres	movie_idx
0	1	Toy Story (1995)	Adventure\|Animation\|Children\|Comedy\|Fantasy	0.0
1	2	Jumanji (1995)	Adventure\|Children\|Fantasy	481.0
2	3	Grumpier Old Men (1995)	Comedy\|Romance	1.0
3	4	Waiting to Exhale (1995)	Comedy\|Drama\|Romance	482.0
4	5	Father of the Bride Part II (1995)	Comedy	483.0

Unmapped movies, i.e. movies that have not been rated by any user.

[474]:

# (rows, columns) of the catalogue entries that received no movie_idx.
movies.loc[movies['movie_idx'].isna()].shape

[474]:

(18, 4)

User Movie Matrix#

[32]:

# Reshape the long ratings table into a user x movie grid: one row per
# user_idx, one column per movie_idx, NaN where a user has not rated a movie.
user_movie_matrix = ratings.pivot_table(
    index=['user_idx'],
    columns=['movie_idx'],
    values=['rating'],
)
user_movie_matrix.head(n=5)

[32]:

	rating
movie_idx	0	1	2	3	4	5	6	7	8	9	...	9714	9715	9716	9717	9718	9719	9720	9721	9722	9723
user_idx
0	4.0	4.0	4.0	5.0	5.0	3.0	5.0	4.0	5.0	5.0	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
1	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
2	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
3	NaN	NaN	NaN	2.0	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
4	4.0	NaN	NaN	NaN	4.0	NaN	NaN	4.0	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

5 rows × 9724 columns

[34]:

# Dense numeric matrix for the SVD: missing ratings are replaced by 0.
X = user_movie_matrix.fillna(value=0).to_numpy()

Mask for NaN values#

[441]:

# Boolean mask with the same layout as the user x movie grid:
# True where a rating exists, False where the cell is NaN.
M = user_movie_matrix.notna().to_numpy()
M

[441]:

array([[ True,  True,  True, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       ...,
       [ True,  True, False, ..., False, False, False],
       [ True, False, False, ..., False, False, False],
       [ True, False,  True, ...,  True,  True,  True]])

[35]:

# Shape of the dense rating matrix: (number of user_idx rows, number of movie_idx columns).
X.shape

[35]:

(610, 9724)

SVD Calculation#

[36]:

# Reduced SVD of the rating matrix (full_matrices=False), so that
# X == U @ diag(S) @ VT; then show the shape of each factor.
U, S, VT = np.linalg.svd(X, full_matrices=False)
tuple(factor.shape for factor in (U, S, VT))

[36]:

((610, 610), (610,), (610, 9724))

Singular values plot#

[435]:

# Percentage levels 0, 10, ..., 100 used as horizontal guide lines in the
# coverage plot. Built directly as integers: scaling a float linspace by 100
# and truncating with astype('int') depends on every product rounding at or
# above the intended integer.
deciles = np.arange(0, 101, 10)
deciles

[435]:

array([  0,  10,  20,  30,  40,  50,  60,  70,  80,  90, 100])

[436]:

# Eleven evenly spaced component counts from 1 up to the number of singular
# values, truncated to whole numbers.
n_singular_values = len(S)
dims = np.linspace(start=1, stop=n_singular_values, num=11).astype(int)
dims

[436]:

array([  1,  61, 122, 183, 244, 305, 366, 427, 488, 549, 610])

[458]:

fig, ax = plt.subplots(1, 2, figsize=(10, 5))

# Cumulative share of the summed singular values captured by the first k
# components; entry k-1 holds the coverage of the first k components.
eig_value_coverage = S.cumsum()/S.sum()
coverage = np.round(eig_value_coverage[dims-1]*100, 3)
table = np.c_[coverage, dims]

# Plot against the 1-based component count so the curve lines up with the
# vertical `dims` markers and with the table on the right. Plotting against
# the default 0-based index would shift the curve one step to the left.
component_counts = np.arange(1, len(S) + 1)
ax[0].plot(component_counts, eig_value_coverage, ".-", color='k')
ax[0].hlines(deciles/100, xmin=0, xmax=len(S), alpha=0.8, color='k', linestyle='--')
ax[0].vlines(dims, ymin=0, ymax=1, alpha=0.8, color='r', linestyle='--')

ax[0].set_xticks(dims)
ax[0].set_yticks(deciles/100)
ax[0].set_xlabel('dimensions')
ax[0].set_ylabel('coverage')
ax[0].grid()

# Right panel: the same coverage values (in percent) as a table.
ax[1].table(cellText=table, colLabels=['coverage', 'dimensions'], loc='center')
ax[1].axis('off')

plt.show()

../_images/notebooks_svd_20_0.png

Low Rank Matrix#

[443]:

def low_rank_matrix(U, S, VT, rank):
    """Rebuild a matrix from the first `rank` components of its SVD.

    Parameters
    ----------
    U : array whose columns are the left singular vectors.
    S : 1-D array of singular values.
    VT : array whose rows are the right singular vectors.
    rank : number of leading components to keep.

    Returns
    -------
    The product U[:, :rank] @ diag(S[:rank]) @ VT[:rank, :].
    """
    # Scaling the kept columns of U by the singular values (broadcast over
    # rows) equals multiplying by diag(S[:rank]) without materialising a
    # dense rank x rank matrix.
    return (U[:, :rank] * S[:rank]) @ VT[:rank, :]

Loss : RMSE#

[452]:

def loss(X, U, S, VT, rank, M):
    """Root-mean-square error of the rank-`rank` reconstruction of X.

    Only the entries where the boolean mask `M` is True contribute, so
    positions that were filled in for missing ratings do not affect the
    error.

    Parameters
    ----------
    X : matrix being approximated.
    U, S, VT : SVD factors, passed through to `low_rank_matrix`.
    rank : number of leading components used for the reconstruction.
    M : boolean array with the same shape as X; True marks entries to score.

    Returns
    -------
    The RMSE over the masked entries, or NaN when the mask selects nothing.
    """
    X_tilde = low_rank_matrix(U, S, VT, rank)
    # Select the observed entries explicitly. `np.square(..., where=M)`
    # without an `out` array leaves the unselected positions uninitialised,
    # and summing over the full array would then include arbitrary values.
    residuals = (X - X_tilde)[M]
    if residuals.size == 0:
        return float("nan")
    # Mean (not sum) of the squared residuals, as the name RMSE requires.
    return np.sqrt(np.mean(np.square(residuals)))

[459]:

# Evaluate the reconstruction loss for a range of truncation ranks and plot
# loss against rank.
ranks = [2, 10, 20, 30, 50, 100, 150, 200, 300, 600]
l_losses = [loss(X, U, S, VT, r, M) for r in ranks]

fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(ranks, l_losses, 'ko-')
ax.grid()

plt.show()

../_images/notebooks_svd_25_0.png

single prediction#

[460]:

def get_prediction(user_idx, movie_idx, U, S, VT, rank):
    """Predicted rating(s) from the rank-`rank` truncated SVD.

    Parameters
    ----------
    user_idx : row selector into U.
    movie_idx : column selector into VT; passing ``...`` (Ellipsis) selects
        every column and yields the predictions for all movies at once.
    U, S, VT : SVD factors of the user x movie matrix.
    rank : number of leading components to use.

    Returns
    -------
    U[user_idx, :rank] @ diag(S[:rank]) @ VT[:rank, movie_idx] -- a scalar
    for a single (user, movie) pair, or an array when a selector spans
    several entries.
    """
    # Element-wise scaling by the singular values equals the product with
    # diag(S[:rank]) and avoids building a dense rank x rank matrix.
    return (U[user_idx, :rank] * S[:rank]) @ VT[:rank, movie_idx]

[461]:

# Predicted rating of the first user for the first movie using two components.
get_prediction(user_idx=0, movie_idx=0, U=U, S=S, VT=VT, rank=2)

[461]:

2.4850892043557016

Top unwatched/unrated movie recommendations for a given user index#

[462]:

# Row labels of the user x movie grid, i.e. the available user_idx values.
user_movie_matrix.index

[462]:

Int64Index([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
            ...
            600, 601, 602, 603, 604, 605, 606, 607, 608, 609],
           dtype='int64', name='user_idx', length=610)

[469]:

# User (row of the user x movie grid) to recommend for.
user_idx = 10
# Number of leading SVD components used for the predictions.
rank = 100

[476]:

# Ratings row of the selected user; NaN marks movies the user has not rated.
user_row = user_movie_matrix.iloc[user_idx]
user_vector = user_row.to_numpy()
user_vector

[476]:

array([nan, nan,  5., ..., nan, nan, nan])

[478]:

# Boolean mask over movie positions: True where the selected user has no rating.
non_rated_movies_idx = pd.isna(user_vector)
non_rated_movies_idx

[478]:

array([ True,  True, False, ...,  True,  True,  True])

[479]:

# Passing Ellipsis as the movie selector keeps every column of VT, so a single
# call returns the predicted rating of the selected user for every movie.
all_movies_ratings = get_prediction(user_idx, Ellipsis, U, S, VT, rank)
all_movies_ratings

[479]:

array([ 1.17833575, -0.08102169,  1.9669552 , ...,  0.01067371,
        0.01245267,  0.01245267])

[480]:

# Number of recommendations to return.
top_n_movies = 5

[481]:

# Movie indices ordered by predicted rating, best first.
ranked_movie_idxs = np.argsort(all_movies_ratings)[::-1]
# `non_rated_movies_idx` is indexed by movie position, so it has to be looked
# up through the ranked indices. Applying the mask directly to the argsort
# output would filter by sort position and keep the wrong movies.
unrated_ranked_idxs = ranked_movie_idxs[non_rated_movies_idx[ranked_movie_idxs]]
top_n_idxs = unrated_ranked_idxs[:top_n_movies]
top_n_idxs

[481]:

array([  7,  20,  25, 463,  34])

[482]:

# Look up the catalogue rows of the recommended movies and show them in
# recommendation order (the order of `top_n_idxs`); a plain `isin` filter
# would return them in catalogue order instead.
recommendation_position = {int(idx): pos for pos, idx in enumerate(top_n_idxs)}
recommended_movies = movies[movies['movie_idx'].isin(top_n_idxs)]
# The rows kept by `isin` all have a non-NaN movie_idx, so int() is safe here.
row_positions = recommended_movies['movie_idx'].map(lambda idx: recommendation_position[int(idx)])
recommended_movies.loc[row_positions.sort_values().index]

[482]:

	movieId	title	genres	movie_idx
97	110	Braveheart (1995)	Action\|Drama\|War	7.0
123	150	Apollo 13 (1995)	Adventure\|Drama\|IMAX	463.0
314	356	Forrest Gump (1994)	Comedy\|Drama\|Romance\|War	20.0
398	457	Fugitive, The (1993)	Thriller	25.0
510	593	Silence of the Lambs, The (1991)	Crime\|Horror\|Thriller	34.0