# Source code for lightfm.datasets.stackexchange

import os

import numpy as np

import scipy.sparse as sp

from lightfm.datasets import _common


def fetch_stackexchange(
    dataset,
    test_set_fraction=0.2,
    min_training_interactions=1,
    data_home=None,
    indicator_features=True,
    tag_features=False,
    download_if_missing=True,
):
    """
    Fetch a dataset from the `StackExchange network <http://stackexchange.com/>`_.

    The datasets contain users answering questions: an interaction is defined as a user
    answering a given question.

    The following datasets from the StackExchange network are available:

    - CrossValidated: From stats.stackexchange.com. Approximately 9000 users, 72000 questions,
      and 70000 answers.
    - StackOverflow: From stackoverflow.com. Approximately 1.3M users, 11M questions,
      and 18M answers.

    Parameters
    ----------

    dataset: string, one of ('crossvalidated', 'stackoverflow')
        The part of the StackExchange network for which to fetch the dataset.
    test_set_fraction: float, optional
        The fraction of the dataset used for testing; must lie strictly between 0 and 1.
        Splitting into the train and test set is done in a time-based fashion: all
        interactions before a certain time are in the train set and all interactions
        from that time onwards are in the test set.
    min_training_interactions: int, optional
        Only keep users that have strictly more than this many interactions in the
        training set. The same users are removed from the test set. No filtering is
        applied when the value is 0 or less.
    data_home: path, optional
        Path to the directory in which the downloaded data should be placed.
        Defaults to ``~/lightfm_data/``.
    indicator_features: bool, optional
        Use an [n_items, n_items] identity matrix for item features. When True together
        with tag_features, indicator and tag features are concatenated into a single
        feature matrix whose columns are the indicator columns followed by the tag columns.
    tag_features: bool, optional
        Use the tag feature matrix shipped with the dataset as item features.
    download_if_missing: bool, optional
        Download the data if not present. Raises an IOError if False and data is missing.

    Returns
    -------

    A dictionary containing the following keys:

    train: sp.coo_matrix of shape [n_users, n_items]
         Contains training set interactions.
    test: sp.coo_matrix of shape [n_users, n_items]
         Contains testing set interactions.
    item_features: sp.csr_matrix of shape [n_items, n_item_features]
         Contains item features.
    item_feature_labels: np.array of strings of shape [n_item_features,]
         Labels of item features.

    Raises
    ------

    ValueError
        If both indicator_features and tag_features are False, if dataset is not one of
        the supported names, or if test_set_fraction is not strictly between 0 and 1.
    """

    # Validate cheap arguments first so that bad calls fail before any download happens.
    if not (indicator_features or tag_features):
        raise ValueError(
            "At least one of indicator_features or tag_features must be True"
        )

    if dataset not in ("crossvalidated", "stackoverflow"):
        raise ValueError("Unknown dataset")

    if not (0.0 < test_set_fraction < 1.0):
        raise ValueError("Test set fraction must be between 0 and 1")

    urls = {
        "crossvalidated": (
            "https://github.com/maciejkula/lightfm_datasets/releases/"
            "download/v0.1.0/stackexchange_crossvalidated.npz"
        ),
        "stackoverflow": (
            "https://github.com/maciejkula/lightfm_datasets/releases/"
            "download/v0.1.0/stackexchange_stackoverflow.npz"
        ),
    }

    path = _common.get_data(
        data_home,
        urls[dataset],
        os.path.join("stackexchange", dataset),
        "data.npz",
        download_if_missing,
    )

    # np.load on an .npz archive returns an NpzFile that holds the file open; read every
    # array we need inside the ``with`` block so the handle is released afterwards.
    with np.load(path) as data:
        interactions = sp.coo_matrix(
            (
                data["interactions_data"],
                (data["interactions_row"], data["interactions_col"]),
            ),
            shape=data["interactions_shape"].flatten(),
        )
        tag_features_mat = sp.coo_matrix(
            (data["features_data"], (data["features_row"], data["features_col"])),
            shape=data["features_shape"].flatten(),
        )
        tag_labels = data["labels"]

    # Collapse repeated (row, col) entries into one; scipy sums their values.
    interactions.sum_duplicates()

    # The interaction values are used as timestamps: the value found at the
    # (1 - test_set_fraction) quantile position becomes the train/test boundary.
    test_cutoff_index = int(len(interactions.data) * (1.0 - test_set_fraction))
    test_cutoff_timestamp = np.sort(interactions.data)[test_cutoff_index]
    in_train = interactions.data < test_cutoff_timestamp
    in_test = np.logical_not(in_train)

    # Both splits are binary (all ones) and keep the full interaction matrix shape.
    train = sp.coo_matrix(
        (
            np.ones(in_train.sum(), dtype=np.float32),
            (interactions.row[in_train], interactions.col[in_train]),
        ),
        shape=interactions.shape,
    )
    test = sp.coo_matrix(
        (
            np.ones(in_test.sum(), dtype=np.float32),
            (interactions.row[in_test], interactions.col[in_test]),
        ),
        shape=interactions.shape,
    )

    if min_training_interactions > 0:
        # NOTE(review): the comparison is strict, so with the default of 1 users with
        # exactly one training interaction are dropped. Kept as-is because changing it
        # would alter the returned splits for existing callers - confirm intent.
        include = np.squeeze(np.array(train.getnnz(axis=1))) > min_training_interactions

        # Row-select the same users from both splits so their shapes stay aligned.
        train = train.tocsr()[include].tocoo()
        test = test.tocsr()[include].tocoo()

    n_items = train.shape[1]

    if indicator_features:
        # One indicator column per question, labelled by its column index.
        id_features = sp.identity(n_items, format="csr", dtype=np.float32)
        id_labels = np.array(["question_id:{}".format(x) for x in range(n_items)])

    if indicator_features and tag_features:
        features = sp.hstack([id_features, tag_features_mat]).tocsr()
        labels = np.concatenate([id_labels, tag_labels])
    elif indicator_features:
        features = id_features
        labels = id_labels
    else:
        # Only tag features were requested (the all-False case was rejected above).
        features = tag_features_mat.tocsr()
        labels = tag_labels

    return {
        "train": train,
        "test": test,
        "item_features": features,
        "item_feature_labels": labels,
    }