Skip to content

prediction_matrix

logger = logging.getLogger(__name__) module-attribute

PredictionMatrix

Bases: InteractionMatrix

Source code in src/recnexteval/matrix/prediction_matrix.py
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
class PredictionMatrix(InteractionMatrix):
    """Interaction matrix variant for the data a model is asked to predict on.

    On top of :class:`InteractionMatrix` it offers a converter from a plain
    interaction matrix, a filter that keeps only the rows still to be
    predicted, and a way to pin the known ``(user, item)`` shape.
    """

    @classmethod
    def from_interaction_matrix(cls, im: InteractionMatrix) -> "PredictionMatrix":
        """Create a PredictionMatrix from an InteractionMatrix.

        The dataframe of ``im`` is passed to the constructor directly (it is
        not copied here) together with ``skip_df_processing=True``.

        :param im: The InteractionMatrix to convert.
        :type im: InteractionMatrix
        :return: A new PredictionMatrix with the same data.
        :rtype: PredictionMatrix
        """
        return cls(
            df=im._df,
            item_ix=im.ITEM_IX,
            user_ix=im.USER_IX,
            timestamp_ix=im.TIMESTAMP_IX,
            # NOTE(review): this reads `im.shape` and falls back to None when the
            # attribute is absent. The shape attribute written by this class is
            # `user_item_shape` -- confirm which attribute is intended here.
            shape=getattr(im, "shape", None),
            skip_df_processing=True,
        )

    def filter_for_predict(self) -> Self:
        """Get the data to be predicted.

        The rows to predict are the ones selected by ``items_in({-1})``, i.e.
        those whose item ID is the ``-1`` placeholder.

        :return: Matrix with only the data to be predicted.
        :rtype: Self
        """
        return self.items_in({-1})

    def mask_user_item_shape(self, shape: tuple[int, int]) -> None:
        """Set the known ``(user, item)`` shape of this matrix and validate it.

        The shape restricts what is released to the models to the user/item
        base they are meant to know about, which addresses the data leakage
        issue. It is recommended that the programmer defines the shape such
        that the model only sees the data that is intended to be seen.

        =======
        Example
        =======

        Given the following case where the data is as follows::

            > uid: [0, 1, 2, 3, 4, 5]
            > iid: [0, 1, 2, 3, -1, -1]
            > ts : [0, 1, 2, 3, 4, 6]

        Users 4 and 5 are the users to be predicted. Assuming they are unknown
        users, that is, the model has never seen them before, the shape of the
        matrix should be (4, 4).

        This method only stores ``shape`` in :attr:`user_item_shape` and then
        runs :meth:`_check_user_item_shape`. No rows are dropped or modified;
        IDs that do not fit into the shape only trigger a warning. This follows
        from the assumption that user/item IDs start from 0 as defined in the
        dataset class.

        :param shape: Shape of the known user and item base, as
            ``(number of users, number of items)``. This value is usually set
            by the evaluator during the evaluation run. It can also be set
            manually by the programmer if there is a need to alter the known
            user/item base.
        :type shape: tuple[int, int]
        """
        # Lazy %-style arguments: the message is only rendered when DEBUG is enabled.
        logger.debug(
            "(user x item) shape defined is %s. Shape of dataframe stored in matrix was %s before masking",
            shape,
            self._df.shape,
        )
        self.user_item_shape = shape
        logger.debug("Final (user x item) shape defined is %s", self.user_item_shape)
        self._check_user_item_shape()

    def _check_user_item_shape(self) -> None:
        """Validate :attr:`user_item_shape` against the IDs stored in the dataframe.

        IDs are zero-indexed, so a column whose highest ID is ``m`` needs a
        dimension of at least ``m + 1``. Cells equal to ``-1`` are ignored when
        looking for the highest ID. When the shape is too small a warning is
        logged and a :class:`UserWarning` is issued; the data is left untouched.

        :raises AttributeError: If ``user_item_shape`` has not been set yet.
        :raises ValueError: If either dimension of the shape is ``None``.
        """
        if not hasattr(self, "user_item_shape"):
            raise AttributeError(
                "InteractionMatrix has no `user_item_shape` attribute. "
                "Please call mask_user_item_shape() first."
            )
        if self.user_item_shape[0] is None or self.user_item_shape[1] is None:
            raise ValueError("Shape must be defined.")

        user_ids = self._df[InteractionMatrix.USER_IX]
        item_ids = self._df[InteractionMatrix.ITEM_IX]
        # `.max()` yields NaN when no valid ID is left. For users the NaN is kept,
        # which makes the comparison below False; for items it is mapped to -1 so
        # that an item-less frame requires zero columns.
        max_user_id = user_ids[user_ids != -1].max()
        max_item_id = np.nan_to_num(item_ids[item_ids != -1].max(), nan=-1)

        # A shape is a count while IDs start at 0: the highest ID `m` needs `m + 1` slots.
        req_rows = max_user_id + 1
        req_cols = max_item_id + 1

        if self.user_item_shape[0] < req_rows or self.user_item_shape[1] < req_cols:
            logger.warning(
                "InteractionMatrix shape mismatch detected. "
                "Current shape: %s. Required minimum: (%s, %s). "
                "Data loss may occur.",
                self.user_item_shape,
                req_rows,
                req_cols,
            )
            # NOTE(review): the hint below refers to `mask_shape(drop_unknown=True)`,
            # which is not defined in this class -- confirm the API it should name.
            warn(
                "Provided shape does not match known id; there are id that are out of bounds. "
                "Call mask_shape(drop_unknown=True) to drop unknown users and items.",
                category=UserWarning,
                stacklevel=2,
            )

ITEM_IX = 'iid' class-attribute instance-attribute

USER_IX = 'uid' class-attribute instance-attribute

TIMESTAMP_IX = 'ts' class-attribute instance-attribute

INTERACTION_IX = 'interactionid' class-attribute instance-attribute

MASKED_LABEL = -1 class-attribute instance-attribute

user_item_shape instance-attribute

The shape of the interaction matrix, i.e. |user| x |item|.

values property

All user-item interactions as a sparse matrix of size (|users|, |items|).

The shape of the matrix is determined by the user_item_shape attribute. Each row represents a user and each column represents an item. The index of the rows and columns correspond to the user and item IDs respectively. An entry in the matrix is 1 if there is an interaction.

indices property

Returns a tuple of lists of user IDs and item IDs corresponding to interactions.

:return: tuple of lists of user IDs and item IDs that correspond to at least one interaction. :rtype: tuple[list[int], list[int]]

item_interaction_sequence_matrix property

Converts the interaction data into an item interaction sequence matrix.

The dataframe is converted such that the row order is maintained, and for each interaction the column at the interacted item's item_id is marked with 1.

user_id_sequence_array property

Array of user IDs in the order of interactions.

:return: Numpy array of user IDs. :rtype: np.ndarray

user_ids property

The set of all user IDs in the matrix.

item_ids property

The set of all item IDs in the matrix.

num_interactions property

The total number of interactions.

:return: Total interaction count. :rtype: int

has_timestamps property

Boolean indicating whether instance has timestamp information.

:return: True if timestamps information is available, False otherwise. :rtype: bool

min_timestamp property

The earliest timestamp in the interaction

:return: The earliest timestamp. :rtype: int

max_timestamp property

The latest timestamp in the interaction

:return: The latest timestamp. :rtype: int

global_num_user property

global_num_item property

known_num_user property

The highest known number of users

Note that we add 1 to the max known user ID to get the number of users, since user IDs are zero-indexed.

known_num_item property

The highest known number of items (the item counterpart of known_num_user).

max_user_id property

The highest known user ID in the interaction matrix.

:return: The highest user ID. :rtype: int

max_item_id property

The highest known item ID in the interaction matrix.

In the case of an empty matrix, the highest item ID is -1. This is consistent with the definition that -1 denotes the item that is unknown. It would be incorrect to use any other value, since 0 is a valid item ID.

:return: The highest item ID. :rtype: int

timestamps property

Timestamps of interactions as a pandas Series, indexed by user ID and item ID.

:raises TimestampAttributeMissingError: If timestamp column is missing. :return: Interactions with composite index on (user ID, item ID) :rtype: pd.Series

latest_interaction_timestamps_matrix property

A csr matrix containing the last interaction timestamp for each user, item pair.

We only account for the last interacted timestamp making the dataset non-deduplicated.

from_interaction_matrix(im) classmethod

Create a PredictionMatrix from an InteractionMatrix.

:param im: The InteractionMatrix to convert. :type im: InteractionMatrix :return: A new PredictionMatrix with the same data. :rtype: PredictionMatrix

Source code in src/recnexteval/matrix/prediction_matrix.py
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
@classmethod
def from_interaction_matrix(cls, im: InteractionMatrix) -> "PredictionMatrix":
    """Build a PredictionMatrix on top of an existing InteractionMatrix.

    The source dataframe is forwarded to the constructor unchanged, and
    dataframe processing is skipped.

    :param im: The InteractionMatrix to convert.
    :type im: InteractionMatrix
    :return: A new PredictionMatrix with the same data.
    :rtype: PredictionMatrix
    """
    # Collect the constructor arguments first, then forward them in one call.
    init_kwargs = {
        "df": im._df,
        "item_ix": im.ITEM_IX,
        "user_ix": im.USER_IX,
        "timestamp_ix": im.TIMESTAMP_IX,
        # Falls back to None when `im` carries no `shape` attribute.
        "shape": getattr(im, "shape", None),
        "skip_df_processing": True,
    }
    return cls(**init_kwargs)

filter_for_predict()

Get the data to be predicted.

:return: InteractionMatrix with only the data to be predicted. :rtype: InteractionMatrix

Source code in src/recnexteval/matrix/prediction_matrix.py
33
34
35
36
37
38
39
def filter_for_predict(self) -> Self:
    """Select the interactions that still have to be predicted.

    :return: The result of ``items_in({-1})``, i.e. the rows carrying the
        ``-1`` item placeholder.
    :rtype: Self
    """
    to_predict = self.items_in({-1})
    return to_predict

mask_user_item_shape(shape)

Masks global user and item ID.

Ensures that the matrix released to the models only contains data that is intended to be released. This addresses the data leakage issue. It is recommended that the programmer defines the shape of the matrix such that the model only sees the data that is intended to be seen.

======= Example =======

Given the following case where the data is as follows::

> uid: [0, 1, 2, 3, 4, 5]
> iid: [0, 1, 2, 3, -1, -1]
> ts : [0, 1, 2, 3, 4, 6]

Where user 4, 5 is the user to be predicted. Assuming that user 4, 5 is an unknown user, that is, the model has never seen user 4, 5 before. The shape of the matrix should be (4, 4). This should be defined when calling the function in :param:shape.

If the shape is defined, and it contains ID of unknown user/item, a warning will be raised if :attr:drop_unknown is set to False. If :attr:drop_unknown is set to True, the unknown user/item will be dropped from the data. All user/item ID greater than shape[0] will be dropped. This follows from the initial assumption that the user/item ID starts from 0 as defined in the dataset class.

Else, in the event that :param:shape is not defined, the shape will be inferred from the data. The shape will be determined by the number of unique users/items. In this case the shape will be (5, 4). Note that the shape may not be as intended by the programmer if the data contains unknown users/items or if the dataframe does not contain all historical users/items.

:param shape: Shape of the known user and item base. This value is usually set by the evaluator during the evaluation run. This value can also be set manually by the programmer if there is a need to alter the known user/item base. Defaults to None :type shape: Optional[tuple[int, int]], optional :param drop_unknown_user: To drop unknown users in the dataset, defaults to False :type drop_unknown_user: bool, optional :param drop_unknown_item: To drop unknown items in the dataset, defaults to False :type drop_unknown_item: bool, optional :param inherit_max_id: To inherit the maximum user and item ID from the given shape and the dataframe. This is useful when the shape is defined and the dataframe contains unknown users/items. Defaults to False :type inherit_max_id: bool, optional

Source code in src/recnexteval/matrix/prediction_matrix.py
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
def mask_user_item_shape(self, shape: tuple[int, int]) -> None:
    """Set the known ``(user, item)`` shape of this matrix and validate it.

    The shape restricts what is released to the models to the user/item
    base they are meant to know about, which addresses the data leakage
    issue. It is recommended that the programmer defines the shape such
    that the model only sees the data that is intended to be seen.

    =======
    Example
    =======

    Given the following case where the data is as follows::

        > uid: [0, 1, 2, 3, 4, 5]
        > iid: [0, 1, 2, 3, -1, -1]
        > ts : [0, 1, 2, 3, 4, 6]

    Users 4 and 5 are the users to be predicted. Assuming they are unknown
    users, that is, the model has never seen them before, the shape of the
    matrix should be (4, 4).

    This method only stores ``shape`` in :attr:`user_item_shape` and then
    runs :meth:`_check_user_item_shape`, which warns when the data holds IDs
    that do not fit the shape. No rows are dropped or modified here. This
    follows from the assumption that user/item IDs start from 0 as defined
    in the dataset class.

    :param shape: Shape of the known user and item base, as
        ``(number of users, number of items)``. This value is usually set
        by the evaluator during the evaluation run. It can also be set
        manually by the programmer if there is a need to alter the known
        user/item base.
    :type shape: tuple[int, int]
    """
    # Lazy %-style arguments: the message is only rendered when DEBUG is enabled.
    logger.debug(
        "(user x item) shape defined is %s. Shape of dataframe stored in matrix was %s before masking",
        shape,
        self._df.shape,
    )
    self.user_item_shape = shape
    logger.debug("Final (user x item) shape defined is %s", self.user_item_shape)
    self._check_user_item_shape()

timestamps_gt(timestamp, inplace=False)

timestamps_gt(timestamp: float) -> T
timestamps_gt(
    timestamp: float, inplace: Literal[True]
) -> None

Select interactions after a given timestamp.

:param timestamp: The timestamp with which the interactions timestamp is compared. :type timestamp: float :param inplace: Apply the selection in place if True, defaults to False :type inplace: bool, optional :return: None if inplace, otherwise returns a new InteractionMatrix object :rtype: Union[InteractionMatrix, None]

Source code in src/recnexteval/matrix/filters.py
67
68
69
70
71
72
73
74
75
76
77
78
def timestamps_gt(self: T, timestamp: float, inplace: bool = False) -> None | T:
    """Keep the interactions that happened after ``timestamp``.

    Delegates to ``_timestamps_cmp`` with :func:`operator.gt`.

    :param timestamp: The timestamp with which
        the interactions timestamp is compared.
    :type timestamp: float
    :param inplace: Apply the selection in place if True, defaults to False
    :type inplace: bool, optional
    :return: None if `inplace`, otherwise returns a new InteractionMatrix object
    :rtype: Union[InteractionMatrix, None]
    """
    comparator = operator.gt
    return self._timestamps_cmp(comparator, timestamp, inplace)

timestamps_gte(timestamp, inplace=False)

timestamps_gte(timestamp: float) -> T
timestamps_gte(
    timestamp: float, inplace: Literal[True]
) -> None

Select interactions after and including a given timestamp.

:param timestamp: The timestamp with which the interactions timestamp is compared. :type timestamp: float :param inplace: Apply the selection in place if True, defaults to False :type inplace: bool, optional :return: None if inplace, otherwise returns a new InteractionMatrix object :rtype: Union[InteractionMatrix, None]

Source code in src/recnexteval/matrix/filters.py
84
85
86
87
88
89
90
91
92
93
94
95
def timestamps_gte(self: T, timestamp: float, inplace: bool = False) -> None | T:
    """Keep the interactions that happened at or after ``timestamp``.

    Delegates to ``_timestamps_cmp`` with :func:`operator.ge`.

    :param timestamp: The timestamp with which
        the interactions timestamp is compared.
    :type timestamp: float
    :param inplace: Apply the selection in place if True, defaults to False
    :type inplace: bool, optional
    :return: None if `inplace`, otherwise returns a new InteractionMatrix object
    :rtype: Union[InteractionMatrix, None]
    """
    comparator = operator.ge
    return self._timestamps_cmp(comparator, timestamp, inplace)

timestamps_lt(timestamp, inplace=False)

timestamps_lt(timestamp: float) -> T
timestamps_lt(
    timestamp: float, inplace: Literal[True]
) -> None

Select interactions up to a given timestamp.

:param timestamp: The timestamp with which the interactions timestamp is compared. :type timestamp: float :param inplace: Apply the selection in place if True, defaults to False :type inplace: bool, optional :return: None if inplace, otherwise returns a new InteractionMatrix object :rtype: Union[InteractionMatrix, None]

Source code in src/recnexteval/matrix/filters.py
101
102
103
104
105
106
107
108
109
110
111
112
def timestamps_lt(self: T, timestamp: float, inplace: bool = False) -> None | T:
    """Keep the interactions that happened before ``timestamp``.

    Delegates to ``_timestamps_cmp`` with :func:`operator.lt`.

    :param timestamp: The timestamp with which
        the interactions timestamp is compared.
    :type timestamp: float
    :param inplace: Apply the selection in place if True, defaults to False
    :type inplace: bool, optional
    :return: None if `inplace`, otherwise returns a new InteractionMatrix object
    :rtype: Union[InteractionMatrix, None]
    """
    comparator = operator.lt
    return self._timestamps_cmp(comparator, timestamp, inplace)

timestamps_lte(timestamp, inplace=False)

timestamps_lte(timestamp: float) -> T
timestamps_lte(
    timestamp: float, inplace: Literal[True]
) -> None

Select interactions up to and including a given timestamp.

:param timestamp: The timestamp with which the interactions timestamp is compared. :type timestamp: float :param inplace: Apply the selection in place if True, defaults to False :type inplace: bool, optional :return: None if inplace, otherwise returns a new InteractionMatrix object :rtype: Union[InteractionMatrix, None]

Source code in src/recnexteval/matrix/filters.py
118
119
120
121
122
123
124
125
126
127
128
129
def timestamps_lte(self: T, timestamp: float, inplace: bool = False) -> None | T:
    """Keep the interactions that happened at or before ``timestamp``.

    Delegates to ``_timestamps_cmp`` with :func:`operator.le`.

    :param timestamp: The timestamp with which
        the interactions timestamp is compared.
    :type timestamp: float
    :param inplace: Apply the selection in place if True, defaults to False
    :type inplace: bool, optional
    :return: None if `inplace`, otherwise returns a new InteractionMatrix object
    :rtype: Union[InteractionMatrix, None]
    """
    comparator = operator.le
    return self._timestamps_cmp(comparator, timestamp, inplace)

get_users_n_last_interaction(n_seq_data=1, t_upper=None, user_in=None, inplace=False)

Source code in src/recnexteval/matrix/filters.py
143
144
145
146
147
148
def get_users_n_last_interaction(
    self: T,
    n_seq_data: int = 1,
    t_upper: None | int = None,
    user_in: None | set[int] = None,
    inplace: bool = False,
) -> T:
    """Forward to ``_get_last_n_interactions`` with ``by=ItemUserBasedEnum.USER``.

    ``user_in`` is handed over as the helper's ``id_in`` argument; every other
    argument is passed through under its own name.
    NOTE(review): presumably this keeps the last ``n_seq_data`` interactions of
    each user up to ``t_upper`` -- confirm against ``_get_last_n_interactions``.
    """
    # All arguments are forwarded by keyword, exactly as named by the helper.
    forwarded = {
        "by": ItemUserBasedEnum.USER,
        "n_seq_data": n_seq_data,
        "t_upper": t_upper,
        "id_in": user_in,
        "inplace": inplace,
    }
    return self._get_last_n_interactions(**forwarded)

get_items_n_last_interaction(n_seq_data=1, t_upper=None, item_in=None, inplace=False)

Source code in src/recnexteval/matrix/filters.py
150
151
152
153
def get_items_n_last_interaction(
    self: T,
    n_seq_data: int = 1,
    t_upper: None | int = None,
    item_in: None | set[int] = None,
    inplace: bool = False,
) -> T:
    """Forward to ``_get_last_n_interactions`` with ``by=ItemUserBasedEnum.ITEM``.

    ``item_in`` is handed over as the helper's ``id_in`` argument; every other
    argument is passed through under its own name.
    NOTE(review): presumably this keeps the last ``n_seq_data`` interactions of
    each item up to ``t_upper`` -- confirm against ``_get_last_n_interactions``.
    """
    # All arguments are forwarded by keyword, exactly as named by the helper.
    forwarded = {
        "by": ItemUserBasedEnum.ITEM,
        "n_seq_data": n_seq_data,
        "t_upper": t_upper,
        "id_in": item_in,
        "inplace": inplace,
    }
    return self._get_last_n_interactions(**forwarded)

get_users_n_first_interaction(n_seq_data=1, t_lower=None, inplace=False)

Source code in src/recnexteval/matrix/filters.py
155
156
157
158
def get_users_n_first_interaction(
    self: T,
    n_seq_data: int = 1,
    t_lower: None | int = None,
    inplace: bool = False,
) -> T:
    """Forward to ``_get_first_n_interactions`` with ``ItemUserBasedEnum.USER``.

    The arguments are passed positionally in the order
    ``(grouping, n_seq_data, t_lower, inplace)``.
    NOTE(review): presumably this keeps the first ``n_seq_data`` interactions of
    each user from ``t_lower`` on -- confirm against ``_get_first_n_interactions``.
    """
    grouping = ItemUserBasedEnum.USER
    return self._get_first_n_interactions(grouping, n_seq_data, t_lower, inplace)

get_items_n_first_interaction(n_seq_data=1, t_lower=None, inplace=False)

Source code in src/recnexteval/matrix/filters.py
160
161
162
163
def get_items_n_first_interaction(
    self: T,
    n_seq_data: int = 1,
    t_lower: None | int = None,
    inplace: bool = False,
) -> T:
    """Forward to ``_get_first_n_interactions`` with ``ItemUserBasedEnum.ITEM``.

    The arguments are passed positionally in the order
    ``(grouping, n_seq_data, t_lower, inplace)``.
    NOTE(review): presumably this keeps the first ``n_seq_data`` interactions of
    each item from ``t_lower`` on -- confirm against ``_get_first_n_interactions``.
    """
    grouping = ItemUserBasedEnum.ITEM
    return self._get_first_n_interactions(grouping, n_seq_data, t_lower, inplace)

users_in(U, inplace=False)

users_in(U: set[int]) -> T
users_in(U: set[int], inplace: Literal[False]) -> T
users_in(U: set[int], inplace: Literal[True]) -> None
Source code in src/recnexteval/matrix/filters.py
23
24
25
26
def users_in(self: T, U: set[int], inplace: bool = False) -> None | T:
    """Select the rows whose user ID is contained in ``U``.

    :param U: User IDs to keep.
    :param inplace: Forwarded unchanged to ``_apply_mask``.
    :return: Whatever ``_apply_mask`` returns for the computed mask.
    """
    logger.debug("Performing users_in comparison")
    user_column = self._df[self.USER_IX]
    return self._apply_mask(user_column.isin(U), inplace=inplace)

users_not_in(U, inplace=False)

users_not_in(U: set[int]) -> T
users_not_in(U: set[int], inplace: Literal[False]) -> T
users_not_in(U: set[int], inplace: Literal[True]) -> None
Source code in src/recnexteval/matrix/filters.py
34
35
36
37
def users_not_in(self: T, U: set[int], inplace: bool = False) -> None | T:
    """Select the rows whose user ID is NOT contained in ``U``.

    :param U: User IDs to exclude.
    :param inplace: Forwarded unchanged to ``_apply_mask``.
    :return: Whatever ``_apply_mask`` returns for the computed mask.
    """
    logger.debug("Performing users_not_in comparison")
    is_listed = self._df[self.USER_IX].isin(U)
    # Invert the membership test so that only unlisted users survive.
    return self._apply_mask(~is_listed, inplace=inplace)

items_in(id_set, inplace=False)

items_in(id_set: set[int]) -> T
items_in(id_set: set[int], inplace: Literal[False]) -> T
items_in(id_set: set[int], inplace: Literal[True]) -> None
Source code in src/recnexteval/matrix/filters.py
45
46
47
48
def items_in(self: T, id_set: set[int], inplace: bool = False) -> None | T:
    """Select the rows whose item ID is contained in ``id_set``.

    :param id_set: Item IDs to keep.
    :param inplace: Forwarded unchanged to ``_apply_mask``.
    :return: Whatever ``_apply_mask`` returns for the computed mask.
    """
    logger.debug("Performing items_in comparison")
    item_column = self._df[self.ITEM_IX]
    return self._apply_mask(item_column.isin(id_set), inplace=inplace)

items_not_in(id_set, inplace=False)

items_not_in(id_set: set[int]) -> T
items_not_in(
    id_set: set[int], inplace: Literal[False]
) -> T
items_not_in(
    id_set: set[int], inplace: Literal[True]
) -> None
Source code in src/recnexteval/matrix/filters.py
56
57
58
59
def items_not_in(self: T, id_set: set[int], inplace: bool = False) -> None | T:
    """Select the rows whose item ID is NOT contained in ``id_set``.

    :param id_set: Item IDs to exclude.
    :param inplace: Forwarded unchanged to ``_apply_mask``.
    :return: Whatever ``_apply_mask`` returns for the computed mask.
    """
    logger.debug("Performing items_not_in comparison")
    is_listed = self._df[self.ITEM_IX].isin(id_set)
    # Invert the membership test so that only unlisted items survive.
    return self._apply_mask(~is_listed, inplace=inplace)

copy()

Create a deep copy of this InteractionMatrix.

Source code in src/recnexteval/matrix/interaction_matrix.py
77
78
79
def copy(self) -> Self:
    """Return an independent deep copy of this object."""
    clone = deepcopy(self)
    return clone

copy_df(reset_index=False)

Create a deep copy of the dataframe.

Source code in src/recnexteval/matrix/interaction_matrix.py
81
82
83
84
85
def copy_df(self, reset_index: bool = False) -> "pd.DataFrame":
    """Return a deep copy of the underlying dataframe.

    :param reset_index: When True, the copy is taken from the dataframe with
        its index reset (the old index is dropped), defaults to False
    :return: A deep copy of the (optionally re-indexed) dataframe.
    """
    frame = self._df.reset_index(drop=True) if reset_index else self._df
    return deepcopy(frame)

concat(im)

Concatenate this InteractionMatrix with another.

Note

This is an in-place operation and will modify the current object.

Source code in src/recnexteval/matrix/interaction_matrix.py
87
88
89
90
91
92
93
94
95
96
97
98
def concat(self, im: "InteractionMatrix | pd.DataFrame") -> Self:
    """Append the rows of ``im`` to this matrix.

    Note:
        This is an in-place operation: the dataframe of the current object is
        replaced by the concatenation and the object itself is returned.

    :param im: Either a dataframe, or an object whose ``_df`` dataframe is used.
    :return: This same object, after the concatenation.
    """
    # Accept a raw dataframe as well as a matrix wrapping one.
    incoming = im if isinstance(im, pd.DataFrame) else im._df
    self._df = pd.concat([self._df, incoming])
    return self

union(im)

Combine events from this InteractionMatrix with another.

Source code in src/recnexteval/matrix/interaction_matrix.py
101
102
103
def union(self, im: "InteractionMatrix") -> Self:
    """Combine events from this InteractionMatrix with another.

    Named alias for the ``+`` operator: returns ``self + im``.
    """
    combined = self + im
    return combined

difference(im)

Difference between this InteractionMatrix and another.

Source code in src/recnexteval/matrix/interaction_matrix.py
105
106
107
def difference(self, im: "InteractionMatrix") -> Self:
    """Difference between this InteractionMatrix and another.

    Named alias for the ``-`` operator: returns ``self - im``.
    """
    remaining = self - im
    return remaining

nonzero()

Source code in src/recnexteval/matrix/interaction_matrix.py
139
140
def nonzero(self) -> tuple[list[int], list[int]]:
    """Return the result of calling ``nonzero()`` on ``self.values``.

    NOTE(review): for a SciPy sparse matrix this is a ``(row indices,
    column indices)`` pair of the non-zero entries -- confirm that
    ``values`` always has that type.
    """
    matrix = self.values
    return matrix.nonzero()