Singular Value Decomposition#

Singular Value Decomposition (SVD) is explored in detail here.

[1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
[2]:
# Read the ratings CSV into a DataFrame and preview its first five rows.
ratings = pd.read_csv(filepath_or_buffer='/opt/datasetsRepo/RecommendationData/ratings.csv')
ratings.head(n=5)
[2]:
userId movieId rating timestamp
0 1 1 4.0 964982703
1 1 3 4.0 964981247
2 1 6 4.0 964982224
3 1 47 5.0 964983815
4 1 50 5.0 964982931
[243]:
# Read the movies CSV into a DataFrame and preview its first five rows.
movies = pd.read_csv(filepath_or_buffer='/opt/datasetsRepo/RecommendationData/movies.csv')
movies.head(n=5)
[243]:
movieId title genres
0 1 Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy
1 2 Jumanji (1995) Adventure|Children|Fantasy
2 3 Grumpier Old Men (1995) Comedy|Romance
3 4 Waiting to Exhale (1995) Comedy|Drama|Romance
4 5 Father of the Bride Part II (1995) Comedy
[3]:
# Build contiguous 0-based indices for users and movies, numbered in order of
# first appearance in `ratings`, together with the inverse lookups.
idx_to_userid_mapper = dict(enumerate(ratings['userId'].unique()))
userid_to_idx_mapper = {user_id: idx for idx, user_id in idx_to_userid_mapper.items()}

idx_to_movieid_mapper = dict(enumerate(ratings['movieId'].unique()))
movieid_to_idx_mapper = {movie_id: idx for idx, movie_id in idx_to_movieid_mapper.items()}

# Every id in `ratings` is a key of its mapper (both were built from the same
# column), so `.map` produces no NaN and one vectorized cast is enough; this
# replaces the slower per-element `.apply(np.int32)`.
ratings['user_idx'] = ratings['userId'].map(userid_to_idx_mapper).astype(np.int32)
ratings['movie_idx'] = ratings['movieId'].map(movieid_to_idx_mapper).astype(np.int32)
ratings.head(5)
[3]:
userId movieId rating timestamp user_idx movie_idx
0 1 1 4.0 964982703 0 0
1 1 3 4.0 964981247 0 1
2 1 6 4.0 964982224 0 2
3 1 47 5.0 964983815 0 3
4 1 50 5.0 964982931 0 4
[251]:
# Attach the matrix column index to every catalogue entry. Movies whose id is
# not a key of the mapper get NaN, which is why the column is float.
# NOTE: a `.dropna()` on the right-hand side would have no effect here, because
# column assignment re-aligns the Series on `movies.index` and the dropped rows
# come back as NaN; it is therefore omitted.
movies['movie_idx'] = movies['movieId'].map(movieid_to_idx_mapper)
movies.head(5)
[251]:
movieId title genres movie_idx
0 1 Toy Story (1995) Adventure|Animation|Children|Comedy|Fantasy 0.0
1 2 Jumanji (1995) Adventure|Children|Fantasy 481.0
2 3 Grumpier Old Men (1995) Comedy|Romance 1.0
3 4 Waiting to Exhale (1995) Comedy|Drama|Romance 482.0
4 5 Father of the Bride Part II (1995) Comedy 483.0

Unmapped movies, i.e. movies that have not been rated by any user.

[474]:
# (rows, columns) of the catalogue entries that have no movie_idx.
movies.loc[movies['movie_idx'].isna()].shape
[474]:
(18, 4)

User Movie Matrix#

[32]:
# Pivot the long ratings table: one row per user_idx, one column per movie_idx,
# cell = mean rating. Combinations without a rating are NaN. Passing `values`
# as a list keeps the outer 'rating' level on the columns.
user_movie_matrix = ratings.pivot_table(
    index=['user_idx'],
    columns=['movie_idx'],
    values=['rating'],
    aggfunc='mean',
)
user_movie_matrix.head(n=5)
[32]:
rating
movie_idx 0 1 2 3 4 5 6 7 8 9 ... 9714 9715 9716 9717 9718 9719 9720 9721 9722 9723
user_idx
0 4.0 4.0 4.0 5.0 5.0 3.0 5.0 4.0 5.0 5.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 NaN NaN NaN 2.0 NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
4 4.0 NaN NaN NaN 4.0 NaN NaN 4.0 NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

5 rows × 9724 columns

[34]:
# Dense array fed to the SVD; cells without a rating are filled with 0.
X = user_movie_matrix.fillna(value=0).to_numpy()

Mask for NaN values#

[441]:
# Boolean mask with True wherever the pivot table holds a rating (non-NaN cell).
M = user_movie_matrix.notna().to_numpy()
M
[441]:
array([[ True,  True,  True, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       ...,
       [ True,  True, False, ..., False, False, False],
       [ True, False, False, ..., False, False, False],
       [ True, False,  True, ...,  True,  True,  True]])
[35]:
# Dimensions of the dense matrix as (rows, columns).
X.shape
[35]:
(610, 9724)

SVD Calculation#

[36]:
# Reduced SVD of the zero-filled matrix (full_matrices=False).
U, S, VT = np.linalg.svd(X, full_matrices=False)
# Display the shape of each returned factor.
U.shape, S.shape, VT.shape
[36]:
((610, 610), (610,), (610, 9724))

Singular values plot#

[435]:
# Percentages 0, 10, ..., 100 as integers; used for the horizontal guide lines
# and y ticks of the coverage plot.
deciles = np.arange(0, 101, 10)
deciles
[435]:
array([  0,  10,  20,  30,  40,  50,  60,  70,  80,  90, 100])
[436]:
# Eleven evenly spaced component counts from 1 up to the number of singular
# values; the float-to-int cast truncates the fractional part.
dims = np.linspace(1, S.shape[0], num=11).astype(int)
dims
[436]:
array([  1,  61, 122, 183, 244, 305, 366, 427, 488, 549, 610])
[458]:
fig, ax = plt.subplots(1, 2, figsize=(10, 5))

# Cumulative share of the singular-value total covered by the first k components.
eig_value_coverage = S.cumsum()/S.sum()
# `dims` holds 1-based component counts, hence the -1 when indexing.
coverage = np.round(eig_value_coverage[dims-1]*100, 3)
table = np.c_[coverage, dims]

# Plot against 1-based component counts so the curve lines up with the `dims`
# markers; a bare plot(y) would put the first component at x=0.
n_components = np.arange(1, len(S) + 1)
ax[0].plot(n_components, eig_value_coverage, ".-", color='k')
ax[0].hlines(deciles/100, xmin=1, xmax=len(S), alpha=0.8, color='k', linestyle='--')
ax[0].vlines(dims, ymin=0, ymax=1, alpha=0.8, color='r', linestyle='--')

ax[0].set_xticks(dims)
ax[0].set_yticks(deciles/100)
ax[0].set_xlabel('number of singular values kept')
ax[0].set_ylabel('cumulative share of singular-value sum')
ax[0].set_title('Singular value coverage')
ax[0].grid()

# Right panel: the same coverage figures (in percent) as a table.
ax[1].table(cellText=table, colLabels=['coverage', 'dimensions'], loc='center')
ax[1].axis('off')

plt.show()
../_images/notebooks_svd_20_0.png

Low Rank Matrix#

[443]:
def low_rank_matrix(U, S, VT, rank):
    """Rebuild a matrix from the leading `rank` components of its SVD.

    Parameters
    ----------
    U : ndarray
        Left factor; its first `rank` columns are used.
    S : ndarray
        1-D array of singular values; its first `rank` entries are used.
    VT : ndarray
        Right factor; its first `rank` rows are used.
    rank : int
        Number of leading components to keep.

    Returns
    -------
    ndarray
        The product ``U[:, :rank] @ diag(S[:rank]) @ VT[:rank, :]``.
    """
    # Scaling column k of U by S[k] equals right-multiplying by diag(S[:rank]),
    # without building the dense rank x rank matrix or paying an extra matmul.
    X_tilde = (U[:, :rank] * S[:rank]) @ VT[:rank, :]
    return X_tilde

Loss : RMSE#

[452]:
def loss(X, U, S, VT, rank, M):
    """Root-mean-square error of the rank-`rank` reconstruction on masked cells.

    Parameters
    ----------
    X : ndarray
        Reference matrix.
    U, S, VT : ndarray
        SVD factors, forwarded to `low_rank_matrix`.
    rank : int
        Number of leading components used for the reconstruction.
    M : ndarray of bool
        Mask selecting the cells of `X` that enter the error; expected to have
        the same shape as `X`.

    Returns
    -------
    float
        sqrt(mean((X - X_tilde)**2)) over the cells where `M` is True, or NaN
        when the mask selects no cell.
    """
    X_tilde = low_rank_matrix(U, S, VT, rank)
    # Pick the selected cells by boolean indexing. `np.square(..., where=M)`
    # without an `out` array leaves the unselected cells uninitialized, so
    # summing its result would mix arbitrary memory into the loss.
    residuals = (X - X_tilde)[M]
    if residuals.size == 0:
        return float('nan')
    # Mean (not sum) of the squared residuals, so the value is an actual RMSE.
    return np.sqrt(np.mean(np.square(residuals)))
[459]:
# Truncation ranks at which the reconstruction loss is evaluated.
ranks = [2, 10, 20, 30, 50, 100, 150, 200, 300, 600]
# One loss value per rank; the comprehension keeps the loop variable out of
# the notebook namespace.
l_losses = [loss(X, U, S, VT, rank_k, M) for rank_k in ranks]

fig, ax = plt.subplots(1, 1, figsize=(8,5))

ax.plot(ranks, l_losses, 'ko-')
ax.set_xlabel('rank')
ax.set_ylabel('loss')
ax.set_title('Reconstruction loss vs. rank')

ax.grid()
plt.show()
../_images/notebooks_svd_25_0.png

single prediction#

[460]:
def get_prediction(user_idx, movie_idx, U, S, VT, rank):
    """Predict rating(s) from the leading `rank` SVD components.

    Parameters
    ----------
    user_idx : int
        Row selector into `U`.
    movie_idx : int, slice or Ellipsis
        Column selector into `VT`; a non-scalar selector (e.g. ``...``) yields
        one prediction per selected column.
    U, S, VT : ndarray
        SVD factors; `S` is the 1-D array of singular values.
    rank : int
        Number of leading components to use.

    Returns
    -------
    scalar or ndarray
        ``U[user_idx, :rank] @ diag(S[:rank]) @ VT[:rank, movie_idx]``.
    """
    # Element-wise scaling by S[:rank] equals multiplying by diag(S[:rank])
    # without allocating the dense rank x rank matrix.
    user_factors = U[user_idx, :rank] * S[:rank]
    return user_factors @ VT[:rank, movie_idx]
[461]:
# Rank-2 prediction for the first user / first movie of the matrix.
get_prediction(user_idx=0, movie_idx=0, U=U, S=S, VT=VT, rank=2)
[461]:
2.4850892043557016

Top unrated-movie recommendations for a given user index#

[462]:
# Row labels of the pivot table (the user_idx values).
user_movie_matrix.index
[462]:
Int64Index([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
            ...
            600, 601, 602, 603, 604, 605, 606, 607, 608, 609],
           dtype='int64', name='user_idx', length=610)
[469]:
# Row position of the user to produce recommendations for.
user_idx = 10
# Number of leading SVD components used for the predictions.
rank = 100
[476]:
# The selected user's row of the pivot table as a plain array (NaN = no rating).
user_vector = user_movie_matrix.iloc[user_idx].to_numpy()
user_vector
[476]:
array([nan, nan,  5., ..., nan, nan, nan])
[478]:
# Boolean mask over the user's row: True where no rating is recorded.
non_rated_movies_idx = pd.isna(user_vector)
non_rated_movies_idx
[478]:
array([ True,  True, False, ...,  True,  True,  True])
[479]:
# Predicted rating for every column of VT for this user; a full slice as the
# movie selector returns one value per movie.
all_movies_ratings = get_prediction(user_idx, slice(None), U, S, VT, rank)
all_movies_ratings
[479]:
array([ 1.17833575, -0.08102169,  1.9669552 , ...,  0.01067371,
        0.01245267,  0.01245267])
[480]:
# Number of recommendations to return.
top_n_movies = 5
[481]:
# Movie positions ordered from highest to lowest predicted rating.
ranked_movie_idxs = np.argsort(all_movies_ratings)[::-1]
# `non_rated_movies_idx` is indexed by movie position, so it must be looked up
# through the ranked positions; applying it directly to the argsort output
# would filter by rank slot instead and could let already-rated movies through.
is_unrated_in_rank_order = non_rated_movies_idx[ranked_movie_idxs]
top_n_idxs = ranked_movie_idxs[is_unrated_in_rank_order][:top_n_movies]
top_n_idxs
[481]:
array([  7,  20,  25, 463,  34])
[482]:
# Catalogue rows of the recommended movies. Rows appear in `movies` order,
# not in recommendation-rank order.
is_recommended = movies['movie_idx'].isin(top_n_idxs)
movies.loc[is_recommended]
[482]:
movieId title genres movie_idx
97 110 Braveheart (1995) Action|Drama|War 7.0
123 150 Apollo 13 (1995) Adventure|Drama|IMAX 463.0
314 356 Forrest Gump (1994) Comedy|Drama|Romance|War 20.0
398 457 Fugitive, The (1993) Thriller 25.0
510 593 Silence of the Lambs, The (1991) Crime|Horror|Thriller 34.0