"""Build MovieLens dataset as TFRecords."""
import logging
from dataclasses import dataclass
import numpy as np
import deepr
try:
import pandas as pd
except ImportError as e:
print(f"Pandas needs to be installed for MovieLens {e}")
try:
from scipy import sparse
except ImportError as e:
print(f"Scipy needs to be installed for MovieLens {e}")
try:
from sklearn.decomposition import TruncatedSVD
except ImportError as e:
print(f"sklearn needs to be installed for MovieLens {e}")
LOGGER = logging.getLogger(__name__)
@dataclass
class SVD(deepr.jobs.Job):
    """Build SVD item embeddings from a user-item interactions CSV.

    Reads a CSV with integer ``uid`` and ``sid`` columns, builds a binary
    user-item matrix, computes an item-item PMI matrix and factorizes it
    with a truncated SVD. Saves per-item counts and the embedding matrix
    as ``.npy`` files.

    Attributes
    ----------
    path_csv : str
        Path to the input CSV (must contain ``uid`` and ``sid`` columns).
    path_embeddings : str
        Output path for the ``.npy`` item-embedding matrix.
    path_counts : str
        Output path for the ``.npy`` per-item interaction counts.
    vocab_size : int
        Number of items (columns of the user-item matrix).
    dim : int
        Embedding dimension (number of SVD components).
    min_count : int
        Items with a total co-occurrence count below this threshold are
        zeroed out before the PMI computation.
    """

    path_csv: str
    path_embeddings: str
    path_counts: str
    vocab_size: int
    dim: int = 600
    min_count: int = 10

    def run(self):
        """Compute PMI-SVD embeddings and save them with item counts."""
        # Read user-item interactions
        LOGGER.info("Reading user-item rankings from %s", self.path_csv)
        with deepr.io.Path(self.path_csv).open() as file:
            tp = pd.read_csv(file)
        n_users = tp["uid"].max() + 1
        rows, cols = tp["uid"], tp["sid"]
        # Binary interaction matrix: entry (u, i) is 1 if user u saw item i
        user_item = sparse.csr_matrix(
            (np.ones_like(rows), (rows, cols)), dtype="int64", shape=(n_users, self.vocab_size)
        )

        # Per-item interaction counts (column sums of the binary matrix)
        LOGGER.info("Saving counts to %s", self.path_counts)
        item_counts = np.asarray(user_item.sum(axis=0)).flatten()
        with deepr.io.Path(self.path_counts).open("wb") as file:
            np.save(file, item_counts)

        # Item-item co-occurrences, filtering items seen fewer than min_count times
        LOGGER.info("Computing co-occurrence matrix")
        item_item = compute_coocurrence(user_item, self.min_count)

        # Positive PMI from co-occurrence counts
        LOGGER.info("Computing PMI matrix")
        item_item = compute_pmi(item_item)

        # Truncated SVD of the PMI matrix yields the item embeddings
        LOGGER.info("Computing Truncated SVD from PMI matrix")
        svd = TruncatedSVD(n_components=self.dim, algorithm="arpack", random_state=42)
        embeddings = svd.fit_transform(item_item)
        LOGGER.info("Explained variance: %s", svd.explained_variance_ratio_.sum())
        LOGGER.info("Saving embeddings to %s", self.path_embeddings)
        deepr.io.Path(self.path_embeddings).parent.mkdir(parents=True, exist_ok=True)
        with deepr.io.Path(self.path_embeddings).open("wb") as file:
            np.save(file, embeddings)
def compute_coocurrence(user_item, min_count: int):
    """Compute the item-item co-occurrence matrix from a user-item matrix.

    Parameters
    ----------
    user_item : scipy sparse matrix
        Binary (0/1) user-item interaction matrix, shape (n_users, n_items).
    min_count : int
        Items whose total co-occurrence count is positive but strictly
        below this threshold have their rows and columns zeroed out.

    Returns
    -------
    scipy.sparse.csr_matrix
        Item-item co-occurrence matrix with a zero diagonal and no
        explicitly stored zeros.
    """
    # Co-occurrences via dot-product (valid because entries are 1 or 0)
    item_item = user_item.transpose().dot(user_item)
    # An item trivially co-occurs with itself: drop the diagonal
    item_item.setdiag(0)
    # Items that occur, but too rarely (0 < count < min_count)
    counts = item_item.sum(axis=-1).A1
    positive = counts > 0
    rare = counts < min_count
    mask = np.logical_and(positive, rare)
    indices = np.nonzero(mask)
    # Zero both rows and columns of rare items (LIL supports row assignment)
    item_item = item_item.tolil()
    item_item[indices] = 0
    item_item = item_item.transpose()
    item_item[indices] = 0
    # Back to CSR, dropping the explicitly stored zeros we just wrote
    item_item = item_item.tocsr()
    item_item.eliminate_zeros()
    return item_item
def compute_pmi(matrix, cds: float = 0.75, additive_smoothing: float = 0.0, pmi_power: float = 1.0, k: float = 1.0):
    """Compute the shifted positive PMI matrix from an item-item matrix.

    Each stored entry becomes ``max(log p(x, y) - log p(x) - log p_cds(y)
    - log k, 0)`` where the context distribution ``p_cds`` is smoothed by
    raising counts to the power ``cds``.

    Parameters
    ----------
    matrix : scipy sparse matrix
        Item-item co-occurrence matrix (non-negative counts).
    cds : float
        Context-distribution smoothing exponent applied to context counts.
    additive_smoothing : float
        Constant added to marginal counts before normalization.
    pmi_power : float
        Exponent applied to co-occurrence counts when computing p(x, y).
    k : float
        Shift: ``log(k)`` is subtracted from the PMI before clipping at 0.

    Returns
    -------
    scipy.sparse.csr_matrix
        Positive PMI matrix with the same shape as ``matrix``.
    """
    # COO exposes the non-zero entries with their (row, col) coordinates
    matrix_tocoo = matrix.tocoo()
    data = matrix_tocoo.data
    rows = matrix_tocoo.row
    cols = matrix_tocoo.col
    # Marginal counts per row (target item) and per column (context item)
    left = np.array(matrix.sum(axis=1)).flatten()
    right = np.array(matrix.sum(axis=0)).flatten()
    # Normalization constants (smoothed total for the context distribution)
    total_count = data.sum()
    smoothed_total_count = np.power(data, cds).sum()
    scaled_smoothing = np.power(additive_smoothing, cds)
    # Probabilities; logs are safe because COO data is non-zero
    p_xy = np.power(data, pmi_power) / total_count
    p_x = (left[rows] + additive_smoothing) / (total_count + additive_smoothing)
    p_y_cds = np.power(right[cols] + additive_smoothing, cds) / (smoothed_total_count + scaled_smoothing)
    # Shifted positive PMI, clipped at zero
    data_pmi = np.maximum(np.log(p_xy) - np.log(p_x) - np.log(p_y_cds) - np.log(k), 0)
    pmi = sparse.csr_matrix((data_pmi, (rows, cols)), shape=matrix.shape)
    return pmi