Skip to content

lastfm

logger = logging.getLogger(__name__) module-attribute

LastFMDataset

Bases: Dataset

Last FM dataset.

The Last FM dataset contains user interactions with artists. The tags in this dataset are not used in this implementation. The file that will be used is the user_taggedartists-timestamps.dat file. The dataset contains the following columns: [user, artist, tags, timestamp].

The dataset is downloaded from the GroupLens website :cite:Cantador_RecSys2011.

Source code in src/recnexteval/datasets/datasets/lastfm.py
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
class LastFMDataset(Dataset):
    """
    Last FM dataset.

    The Last FM dataset contains user interactions with artists. The tags in
    this dataset are not used in this implementation. The file that will be
    used is the user_taggedartists-timestamps.dat file. The dataset contains
    the following columns: [user, artist, tags, timestamp].

    The dataset is downloaded from the GroupLens website :cite:`Cantador_RecSys2011`.
    """

    # Concrete dataset, not an abstract base class.
    IS_BASE: bool = False

    config: ClassVar[LastFMDatasetConfig] = LastFMDatasetConfig()

    # Populated by `_fetch_dataset_metadata` when metadata fetching is enabled.
    ITEM_METADATA = None
    USER_METADATA = None
    TAG_METADATA = None

    def _zip_path(self) -> str:
        """Return the local path of the downloaded zip archive."""
        return os.path.join(
            self.config.default_base_path, f"{self.config.remote_zipname}.zip"
        )

    def _extract_interaction_file(self, zip_path: str) -> None:
        """Extract the interaction file from the zip archive.

        Args:
            zip_path: Path to the downloaded zip archive.
        """
        with zipfile.ZipFile(zip_path, "r") as zip_ref:
            zip_ref.extract(self.config.remote_filename, self.config.default_base_path)

    def fetch_dataset(self) -> None:
        """Check if dataset is present, if not download.

        This method overrides the base class to handle the special case where
        the zipfile may exist but the extracted file doesn't.
        """
        zip_path = self._zip_path()

        if not os.path.exists(zip_path):
            logger.debug(f"{self.name} dataset zipfile not found in {zip_path}.")
            self._download_dataset()
        elif not os.path.exists(self.file_path):
            logger.debug(
                f"{self.name} dataset file not found, but zipfile already downloaded. "
                f"Extracting file from zipfile."
            )
            self._extract_interaction_file(zip_path)
        else:
            logger.debug("Data zipfile is in memory and in dir specified.")

    def _download_dataset(self) -> None:
        """Downloads the dataset.

        Downloads the zipfile, and extracts the interaction file to `self.file_path`
        """
        zip_path = self._zip_path()

        logger.debug(f"Downloading {self.name} dataset from {self.config.dataset_url}")

        # Download the zip into the data directory
        self._fetch_remote(
            f"{self.config.dataset_url}/{self.config.remote_zipname}.zip",
            zip_path,
        )

        # Extract the interaction file which we will use
        logger.debug(f"Extracting {self.config.remote_filename} from zip")
        self._extract_interaction_file(zip_path)

    def _load_dataframe(self) -> pd.DataFrame:
        """Load the raw dataset from file, and return it as a pandas DataFrame.

        Transform the dataset downloaded to have integer user and item ids. This
        will be needed for representation in the interaction matrix.

        Returns:
            The interaction data as a DataFrame with a row per interaction.
        """
        self.fetch_dataset()
        df = pd.read_csv(
            self.file_path,
            dtype={
                self.config.item_ix: np.int32,
                self.config.user_ix: np.int32,
                self.config.tag_ix: np.int32,
                self.config.timestamp_ix: np.int64,
            },
            sep="\t",
            names=[
                self.config.user_ix,
                self.config.item_ix,
                self.config.tag_ix,
                self.config.timestamp_ix,
            ],
            header=0,
        )
        # Source timestamps are in milliseconds; store epoch seconds.
        df[self.config.timestamp_ix] = df[self.config.timestamp_ix] // 1_000

        logger.debug(f"Loaded {len(df)} interactions")
        return df

    def _fetch_dataset_metadata(
        self, user_id_mapping: pd.DataFrame, item_id_mapping: pd.DataFrame
    ) -> None:
        """Load user, item, and tag metadata using the processed id mappings.

        Args:
            user_id_mapping: Mapping between raw and internal user ids.
            item_id_mapping: Mapping between raw and internal item ids.
        """
        self.USER_METADATA = LastFMUserMetadata(user_id_mapping=user_id_mapping).load()
        self.ITEM_METADATA = LastFMItemMetadata(item_id_mapping=item_id_mapping).load()
        self.TAG_METADATA = LastFMTagMetadata().load()

IS_BASE = False class-attribute instance-attribute

config = LastFMDatasetConfig() class-attribute

ITEM_METADATA = None class-attribute instance-attribute

USER_METADATA = None class-attribute instance-attribute

TAG_METADATA = None class-attribute instance-attribute

name property

Name of the object's class.

:return: Name of the object's class :rtype: str

file_path property

File path of the dataset.

processed_cache_path property

Path for cached processed data.

fetch_metadata = fetch_metadata instance-attribute

preprocessor = DataFramePreprocessor(item_ix=(self.config.item_ix), user_ix=(self.config.user_ix), timestamp_ix=(self.config.timestamp_ix)) instance-attribute

timestamp_min property

Minimum timestamp in the dataset.

Returns:

Type Description
int

Minimum timestamp in the dataset.

timestamp_max property

Maximum timestamp in the dataset.

Returns:

Type Description
int

Maximum timestamp in the dataset.

fetch_dataset()

Check if dataset is present, if not download.

This method overrides the base class to handle the special case where the zipfile may exist but the extracted file doesn't.

Source code in src/recnexteval/datasets/datasets/lastfm.py
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
def fetch_dataset(self) -> None:
    """Ensure the dataset file is available locally, downloading it if needed.

    Overrides the base implementation to also cover the case where the zip
    archive is already on disk but the interaction file was never extracted.
    """
    archive = os.path.join(
        self.config.default_base_path, f"{self.config.remote_zipname}.zip"
    )

    if not os.path.exists(archive):
        # No archive at all: download (download also performs extraction).
        logger.debug(f"{self.name} dataset zipfile not found in {archive}.")
        self._download_dataset()
        return

    if os.path.exists(self.file_path):
        # Both archive and extracted file are present; nothing to do.
        logger.debug("Data zipfile is in memory and in dir specified.")
        return

    # Archive exists but the interaction file is missing: extract only.
    logger.debug(
        f"{self.name} dataset file not found, but zipfile already downloaded. "
        f"Extracting file from zipfile."
    )
    with zipfile.ZipFile(archive, "r") as zf:
        zf.extract(self.config.remote_filename, self.config.default_base_path)

fetch_dataset_force()

Force re-download of the dataset.

Source code in src/recnexteval/datasets/base.py
47
48
49
50
def fetch_dataset_force(self) -> None:
    """Unconditionally re-download the dataset, ignoring any local copy."""
    logger.debug(f"{self.name} force re-download of dataset.")
    self._download_dataset()

get_timestamp_range_in_epoch()

Get the minimum and maximum timestamps in the dataset.

Returns:

Type Description
tuple[int, int]

A tuple of (min_timestamp, max_timestamp).

Raises:

Type Description
RuntimeError

If load() has not been called yet.

Source code in src/recnexteval/datasets/datasets/base.py
129
130
131
132
133
134
135
136
137
138
def get_timestamp_range_in_epoch(self) -> tuple[int, int]:
    """Return the dataset's minimum and maximum timestamps as epoch seconds.

    Returns:
        A tuple of (min_timestamp, max_timestamp).

    Raises:
        RuntimeError: If load() has not been called yet.
    """
    lower_bound = self.timestamp_min
    upper_bound = self.timestamp_max
    return lower_bound, upper_bound

get_timestamp_range_in_datetime()

Get the minimum and maximum timestamps in the dataset.

Returns:

Type Description
tuple[datetime, datetime]

A tuple of (min_timestamp, max_timestamp).

Raises:

Type Description
RuntimeError

If load() has not been called yet.

Source code in src/recnexteval/datasets/datasets/base.py
140
141
142
143
144
145
146
147
148
149
150
151
def get_timestamp_range_in_datetime(self) -> tuple[datetime, datetime]:
    """Return the dataset's minimum and maximum timestamps as datetimes.

    Note: uses ``datetime.fromtimestamp``, so the results are naive datetimes
    in the local timezone.

    Returns:
        A tuple of (min_timestamp, max_timestamp).

    Raises:
        RuntimeError: If load() has not been called yet.
    """
    bounds = (self.timestamp_min, self.timestamp_max)
    earliest, latest = (datetime.fromtimestamp(ts) for ts in bounds)
    return earliest, latest

add_filter(filter_)

Add a filter to be applied when loading the data.

Utilize :class:DataFramePreprocessor class to add filters to the dataset to load. The filter will be applied when the data is loaded into an :class:InteractionMatrix object when :meth:load is called.

:param filter_: Filter to be applied to the loaded DataFrame processing to interaction matrix. :type filter_: Filter

Source code in src/recnexteval/datasets/datasets/base.py
153
154
155
156
157
158
159
160
161
162
163
164
def add_filter(self, filter_: Filter) -> None:
    """Register a filter to be applied when the data is loaded.

    The filter is handed off to the :class:`DataFramePreprocessor`, which
    applies it while parsing the loaded DataFrame into an
    :class:`InteractionMatrix` during :meth:`load`.

    :param filter_: Filter to be applied to the loaded DataFrame
                processing to interaction matrix.
    :type filter_: Filter
    """
    self.preprocessor.add_filter(filter_)

load(apply_filters=True, use_cache=True)

Loads data into an InteractionMatrix object.

Data is loaded into a DataFrame using the :func:_load_dataframe function. Resulting DataFrame is parsed into an :class:InteractionMatrix object. If :data:apply_filters is set to True, the filters set will be applied to the dataset and mapping of user and item ids will be done. This is advised even if there is no filter set, as it will ensure that the user and item ids are incrementing in the order of time.

Parameters:

Name Type Description Default
apply_filters bool

To apply the filters set and preprocessing, defaults to True

True
use_cache bool

Whether to use cached processed data, defaults to True

True

Returns:

Type Description
InteractionMatrix

Resulting interaction matrix.

Source code in src/recnexteval/datasets/datasets/base.py
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
def load(self, apply_filters: bool = True, use_cache: bool = True) -> InteractionMatrix:
    """Loads data into an InteractionMatrix object.

    Data is loaded into a DataFrame using the :func:`_load_dataframe` function.
    Resulting DataFrame is parsed into an :class:`InteractionMatrix` object. If
    :data:`apply_filters` is set to True, the filters set will be applied to the
    dataset and mapping of user and item ids will be done. This is advised
    even if there is no filter set, as it will ensure that the user and item
    ids are incrementing in the order of time.

    Args:
        apply_filters: To apply the filters set and preprocessing,
            defaults to True
        use_cache: Whether to use cached processed data, defaults to True

    Returns:
        Resulting interaction matrix.
    """
    logger.info(f"{self.name} is loading dataset...")
    start = time.time()
    try:
        df = self._load_dataframe_from_cache() if use_cache else self._load_dataframe()
    except FileNotFoundError:
        # Cache file missing: load the raw dataframe instead and write the
        # processed cache so subsequent loads can use it.
        logger.warning("Processed cache not found, loading raw dataframe.")
        df = self._load_dataframe()
        self._cache_processed_dataframe(df)
    if apply_filters:
        logger.debug(f"{self.name} applying filters set.")
        im = self.preprocessor.process(df)
    else:
        # No preprocessing: convert the dataframe directly, keeping raw ids.
        im = self._dataframe_to_matrix(df)
        logger.warning(
            "No filters applied, user and item ids may not be incrementing in the order of time. "
            "Classes that use this dataset may not work as expected."
        )
    # Record timestamp bounds from the loaded frame so the timestamp_min /
    # timestamp_max properties can answer without reloading.
    self._timestamp_min = int(df[self.config.timestamp_ix].min())
    self._timestamp_max = int(df[self.config.timestamp_ix].max())

    if self.fetch_metadata:
        # Metadata loaders need the id mappings built by the preprocessor.
        user_id_mapping, item_id_mapping = (
            self.preprocessor.user_id_mapping,
            self.preprocessor.item_id_mapping,
        )
        self._fetch_dataset_metadata(user_id_mapping=user_id_mapping, item_id_mapping=item_id_mapping)

    end = time.time()
    logger.info(f"{self.name} dataset loaded - Took {end - start:.3}s")
    return im