"""Build MovieLens dataset as TFRecords."""
import logging
from dataclasses import dataclass
import numpy as np
import deepr
try:
import pandas as pd
except ImportError as e:
print(f"Pandas needs to be installed for MovieLens {e}")
try:
from scipy import sparse
except ImportError as e:
print(f"Scipy needs to be installed for MovieLens {e}")
try:
from sklearn.decomposition import TruncatedSVD
except ImportError as e:
print(f"sklearn needs to be installed for MovieLens {e}")
LOGGER = logging.getLogger(__name__)
@dataclass
class SVD(deepr.jobs.Job):
    """Build SVD item embeddings from a user-item interactions CSV.

    Reads a CSV with integer ``uid`` and ``sid`` columns, builds a binary
    user-item matrix, computes an item-item PMI matrix and factorizes it
    with a truncated SVD. Saves per-item counts and the embedding matrix
    as ``.npy`` files.

    Attributes
    ----------
    path_csv : str
        Path to the input CSV (must contain ``uid`` and ``sid`` columns).
    path_embeddings : str
        Output path for the ``.npy`` item-embedding matrix.
    path_counts : str
        Output path for the ``.npy`` per-item interaction counts.
    vocab_size : int
        Number of items (columns of the user-item matrix).
    dim : int
        Embedding dimension (number of SVD components).
    min_count : int
        Items with a total co-occurrence count below this threshold are
        zeroed out before the PMI computation.
    """

    path_csv: str
    path_embeddings: str
    path_counts: str
    vocab_size: int
    dim: int = 600
    min_count: int = 10

    def run(self):
        """Compute PMI-SVD embeddings and save them with item counts."""
        # Read user-item interactions
        LOGGER.info("Reading user-item rankings from %s", self.path_csv)
        with deepr.io.Path(self.path_csv).open() as file:
            tp = pd.read_csv(file)
        n_users = tp["uid"].max() + 1
        rows, cols = tp["uid"], tp["sid"]
        # Binary interaction matrix: entry (u, i) is 1 if user u saw item i
        user_item = sparse.csr_matrix(
            (np.ones_like(rows), (rows, cols)), dtype="int64", shape=(n_users, self.vocab_size)
        )

        # Per-item interaction counts (column sums of the binary matrix)
        LOGGER.info("Saving counts to %s", self.path_counts)
        item_counts = np.asarray(user_item.sum(axis=0)).flatten()
        with deepr.io.Path(self.path_counts).open("wb") as file:
            np.save(file, item_counts)

        # Item-item co-occurrences, filtering items seen fewer than min_count times
        LOGGER.info("Computing co-occurrence matrix")
        item_item = compute_coocurrence(user_item, self.min_count)

        # Positive PMI from co-occurrence counts
        LOGGER.info("Computing PMI matrix")
        item_item = compute_pmi(item_item)

        # Truncated SVD of the PMI matrix yields the item embeddings
        LOGGER.info("Computing Truncated SVD from PMI matrix")
        svd = TruncatedSVD(n_components=self.dim, algorithm="arpack", random_state=42)
        embeddings = svd.fit_transform(item_item)
        LOGGER.info("Explained variance: %s", svd.explained_variance_ratio_.sum())
        LOGGER.info("Saving embeddings to %s", self.path_embeddings)
        deepr.io.Path(self.path_embeddings).parent.mkdir(parents=True, exist_ok=True)
        with deepr.io.Path(self.path_embeddings).open("wb") as file:
            np.save(file, embeddings)
def compute_coocurrence(user_item, min_count: int):
    """Compute the item-item co-occurrence matrix from a user-item matrix.

    Parameters
    ----------
    user_item : scipy sparse matrix
        Binary (0/1) user-item interaction matrix, shape (n_users, n_items).
    min_count : int
        Items whose total co-occurrence count is positive but strictly
        below this threshold have their rows and columns zeroed out.

    Returns
    -------
    scipy.sparse.csr_matrix
        Item-item co-occurrence matrix with a zero diagonal and no
        explicitly stored zeros.
    """
    # Co-occurrences via dot-product (valid because entries are 1 or 0)
    item_item = user_item.transpose().dot(user_item)
    # An item trivially co-occurs with itself: drop the diagonal
    item_item.setdiag(0)
    # Items that occur, but too rarely (0 < count < min_count)
    counts = item_item.sum(axis=-1).A1
    positive = counts > 0
    rare = counts < min_count
    mask = np.logical_and(positive, rare)
    indices = np.nonzero(mask)
    # Zero both rows and columns of rare items (LIL supports row assignment)
    item_item = item_item.tolil()
    item_item[indices] = 0
    item_item = item_item.transpose()
    item_item[indices] = 0
    # Back to CSR, dropping the explicitly stored zeros we just wrote
    item_item = item_item.tocsr()
    item_item.eliminate_zeros()
    return item_item
def compute_pmi(matrix, cds: float = 0.75, additive_smoothing: float = 0.0, pmi_power: float = 1.0, k: float = 1.0):
    """Compute the shifted positive PMI matrix from an item-item matrix.

    Each stored entry becomes ``max(log p(x, y) - log p(x) - log p_cds(y)
    - log k, 0)`` where the context distribution ``p_cds`` is smoothed by
    raising counts to the power ``cds``.

    Parameters
    ----------
    matrix : scipy sparse matrix
        Item-item co-occurrence matrix (non-negative counts).
    cds : float
        Context-distribution smoothing exponent applied to context counts.
    additive_smoothing : float
        Constant added to marginal counts before normalization.
    pmi_power : float
        Exponent applied to co-occurrence counts when computing p(x, y).
    k : float
        Shift: ``log(k)`` is subtracted from the PMI before clipping at 0.

    Returns
    -------
    scipy.sparse.csr_matrix
        Positive PMI matrix with the same shape as ``matrix``.
    """
    # COO exposes the non-zero entries with their (row, col) coordinates
    matrix_tocoo = matrix.tocoo()
    data = matrix_tocoo.data
    rows = matrix_tocoo.row
    cols = matrix_tocoo.col
    # Marginal counts per row (target item) and per column (context item)
    left = np.array(matrix.sum(axis=1)).flatten()
    right = np.array(matrix.sum(axis=0)).flatten()
    # Normalization constants (smoothed total for the context distribution)
    total_count = data.sum()
    smoothed_total_count = np.power(data, cds).sum()
    scaled_smoothing = np.power(additive_smoothing, cds)
    # Probabilities; logs are safe because COO data is non-zero
    p_xy = np.power(data, pmi_power) / total_count
    p_x = (left[rows] + additive_smoothing) / (total_count + additive_smoothing)
    p_y_cds = np.power(right[cols] + additive_smoothing, cds) / (smoothed_total_count + scaled_smoothing)
    # Shifted positive PMI, clipped at zero
    data_pmi = np.maximum(np.log(p_xy) - np.log(p_x) - np.log(p_y_cds) - np.log(k), 0)
    pmi = sparse.csr_matrix((data_pmi, (rows, cols)), shape=matrix.shape)
    return pmi